+ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//log_node31.txt
Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//log_node31.txt
+ export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//ascend/31
+ ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//ascend/31
+ mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//ascend/31
+ DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml
+ TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/
+ CKPT_LOAD_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/
+ VIT_CKPT_LOAD_DIR=/
+ CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//
+ rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743/
sending incremental file list

sent 71 bytes  received 12 bytes  166.00 bytes/sec
total size is 23.84K  speedup is 287.17
+ cd /local_disk/cognitron_vl/
+ rm -fr datasets
+ mkdir -p datasets
+ ln -s /data/data/ datasets/CV
+ ln -s /data/data/LLM datasets/LLM
+ ln -s /data/data/LMM datasets/LMM
+ source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh
++ source /usr/local/Ascend/driver/bin/setenv.bash
+++ DEP_INFO_FILE=/etc/ascend_install.info
+++ [[ -f /etc/ascend_install.info ]]
+++ . /etc/ascend_install.info
+++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common
+++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver
+++ DRV_LIB64_LDPATH=/driver/lib64
+++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
++ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+++ ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
++++ arch
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial:
+++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+++ PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin
+++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
+++ ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
+++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
+++ TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
+++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
+++ ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
++ export HCCL_CONNECT_TIMEOUT=7200
++ HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_EXEC_TIMEOUT=7200
++ HCCL_EXEC_TIMEOUT=7200
++ export COMBINED_ENABLE=1
++ COMBINED_ENABLE=1
++ export MULTI_STREAM_MEMORY_REUSE=1
++ MULTI_STREAM_MEMORY_REUSE=1
++ export HCCL_RDMA_TC=160
++ HCCL_RDMA_TC=160
++ export HCCL_RDMA_SL=5
++ HCCL_RDMA_SL=5
++ export HCCL_INTRA_PCIE_ENABLE=0
++ HCCL_INTRA_PCIE_ENABLE=0
++ export HCCL_INTRA_ROCE_ENABLE=1
++ HCCL_INTRA_ROCE_ENABLE=1
++ export HCCL_RDMA_TIMEOUT=20
++ HCCL_RDMA_TIMEOUT=20
++ export INF_NAN_MODE_ENABLE=1
++ INF_NAN_MODE_ENABLE=1
++ export DISTRIBUTED_BACKEND=hccl
++ DISTRIBUTED_BACKEND=hccl
++ export ASCEND_LAUNCH_BLOCKING=0
++ ASCEND_LAUNCH_BLOCKING=0
++ export ASCEND_SLOG_PRINT_TO_STDOUT=0
++ ASCEND_SLOG_PRINT_TO_STDOUT=0
++ export ASCEND_GLOBAL_LOG_LEVEL=3
++ ASCEND_GLOBAL_LOG_LEVEL=3
++ export ASCEND_GLOBAL_EVENT_ENABLE=0
++ ASCEND_GLOBAL_EVENT_ENABLE=0
++ export TASK_QUEUE_ENABLE=1
++ TASK_QUEUE_ENABLE=1
++ export PTCOPY_ENABLE=1
++ PTCOPY_ENABLE=1
++ export COMBINED_ENABLE=1
++ COMBINED_ENABLE=1
++ export DYNAMIC_OP=ADD#MUL
++ DYNAMIC_OP=ADD#MUL
++ export HCCL_WHITELIST_DISABLE=1
++ HCCL_WHITELIST_DISABLE=1
++ export HCCL_CONNECT_TIMEOUT=7200
++ HCCL_CONNECT_TIMEOUT=7200
++ export HCCL_WHITELIST_DISABLE=1
++ HCCL_WHITELIST_DISABLE=1
++ export CUDA_DEVICE_MAX_CONNECTIONS=1
++ CUDA_DEVICE_MAX_CONNECTIONS=1
++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt
Looking in links: /data/software/
Processing data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1))
Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0)
Processing data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3))
Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0)
Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0)
Processing data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6))
Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5)
Processing data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8))
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0)
Processing data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10))
Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2)
Processing data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12))
Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1)
Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5)
Processing data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15))
Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16)
Processing data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17))
Processing data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18))
Processing data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19))
Processing data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20))
Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3)
Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25)
Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0)
Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5)
Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3)
Processing data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6))
Processing data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3)
Processing data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3)
Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1)
Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0)
Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1)
Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30)
Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (9.0.0)
Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15)
Processing data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2))
Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0)
Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1)
Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3)
Processing data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7)
Processing data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17))
Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1)
Processing data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18))
Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0)
Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0)
Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2)
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0)
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.66.2
    Uninstalling tqdm-4.66.2:
      Successfully uninstalled tqdm-4.66.2
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.4.2
    Uninstalling safetensors-0.4.2:
      Successfully uninstalled safetensors-0.4.2
  Attempting uninstall: requests
    Found existing installation: requests 2.31.0
    Uninstalling requests-2.31.0:
      Successfully uninstalled requests-2.31.0
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 15.0.0
    Uninstalling pyarrow-15.0.0:
      Successfully uninstalled pyarrow-15.0.0
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.25.0
    Uninstalling accelerate-0.25.0:
      Successfully uninstalled accelerate-0.25.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.16.0
    Uninstalling datasets-2.16.0:
      Successfully uninstalled datasets-2.16.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed.
tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed.
tikit 1.8.2.240926 requires future==0.18.2, which is not installed.
tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed.
tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed.
tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed.
tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed.
tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed.
tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed.
tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed.
tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed.
tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed.
tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible.
tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible.
tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible.
tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible.
tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible.
tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible.
Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
++ return 0
+ MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
+ MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
+ MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/
Looking in links: /data/software/
Obtaining file://local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: megatron_core
  Building editable for megatron_core (pyproject.toml): started
  Building editable for megatron_core (pyproject.toml): finished with status 'done'
  Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=1c8a73544a768ff0759eb2db03ef8e548406a6700abe057332d8072922777a16
  Stored in directory: /tmp/pip-ephem-wheel-cache-f3i1665g/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff
Successfully built megatron_core
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: megatron_core
Successfully installed megatron_core-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/
Looking in links: /data/software/
Obtaining file://local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA'
WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA'
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: mindspeed
  Running setup.py develop for mindspeed
Successfully installed mindspeed-0.6.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/
Looking in links: /data/software/
Obtaining file://local_disk/cognitron_vl/third_party/ModelLink
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4)
Processing data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1)
Processing data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4)
Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1)
Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1)
Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0)
Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0)
Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6)
Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2)
Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0)
Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3)
Processing data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1)
Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0)
Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2)
Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8)
Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1)
Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu)
Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1)
Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5)
Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2)
Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1)
Requirement already satisfied: regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25)
Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3)
Processing data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1)
Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7)
Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3)
Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1)
Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15)
Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0)
Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0)
Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1)
Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18)
Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2)
Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1)
Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3)
Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5)
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py): started
  Building wheel for transformers_stream_generator (setup.py): finished with status 'done'
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=53a0efa1548230be4832bd2d5f76d2b932ac2ffee1961d12082c62ce27bcc265
  Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c
Successfully built transformers_stream_generator
DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
  Attempting uninstall: peft
    Found existing installation: peft 0.7.0
    Uninstalling peft-0.7.0:
      Successfully uninstalled peft-0.7.0
  Running setup.py develop for modellink
Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
+ export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+ PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:
+ GPUS_PER_NODE=16
+ NNODES=32
+ NODE_RANK=31
+ MASTER_PORT=34567
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+ PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+ VISION_SEQ_LENGTH=1025
+ IMAGE_TOKEN_LENGTH=256
+ IMAGE_SIZE=448
+ VISION_MODEL_TYPE=intern_300m
+ TP=8
+ PP=1
+ CP=8
+ CP_ALGO=megatron_cp_algo
+ CP_MASK=causal
+ DISTRIBUTED_ARGS='
    --nproc_per_node 16     --nnodes 32     --node_rank 31     --master_addr train-1198772881325351168-93vlj4s2getc-master-0.train-100034032793.svc.cluster.local     --master_port 34567
'
+ GPT_ARGS='
    --use-mcore-models     --tensor-model-parallel-size 8     --pipeline-model-parallel-size 1     --context-parallel-size 8     --context-parallel-algo megatron_cp_algo     --cp-attention-mask-type causal     --use-cp-send-recv-overlap     --no-create-attention-mask-in-dataloader     --sparse-mode 4     --sequence-parallel     --recompute-method block     --recompute-granularity full     --recompute-num-layers 48     --num-layers 48     --hidden-size 5120     --ffn-hidden-size 13824     --num-attention-heads 40     --group-query-attention     --num-query-groups 8     --tokenizer-type PretrainedFromHF     --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/     --seq-length 1048576     --max-position-embeddings 1048576     --micro-batch-size 1     --global-batch-size 8     --make-vocab-size-divisible-by 1     --padded-vocab-size 152064     --rotary-base 1000000.0     --lr 5.00e-6     --train-iters 500     --lr-decay-style cosine     --untie-embeddings-and-output-weights     --disable-bias-linear     --attention-dropout 0.0     --init-method-std 0.01     --hidden-dropout 0.0     --position-embedding-type rope     --normalization RMSNorm     --use-fused-rmsnorm     --norm-epsilon 1e-6     --swiglu     --use-flash-attn     --use-fused-rotary-pos-emb     --use-rotary-position-embeddings     --use-fused-swiglu     --use-mc2     --no-masked-softmax-fusion     --attention-softmax-in-fp32     --min-lr 1.00e-7     --weight-decay 0.0     --lr-warmup-fraction 0.03     --clip-grad 1.0     --adam-beta1 0.9     --adam-beta2 0.999     --add-qkv-bias     --initial-loss-scale 4096     --no-gradient-accumulation-fusion     --use-distributed-optimizer     --bf16     --overlap-grad-reduce     --finetune     --vision-model-freeze     --vision-model-type intern_300m     --vision-downsample-ratio 0.5     --vision-projector-type mlp     --vision-projector-pre-norm     --vision-process-type dynamic     --vision-normalize-type imagenet     --vision-seq-length 1025     --image-token-length 256     --image-size 448     --prompt-format qwen2     --is-instruction-dataset     --max-num-image 4096     --max-fps 1     --add-class-token     --min-patch-grid 1     --max-patch-grid 12     --logit-mask     --cross-dataset-joint '
+ DATA_ARGS='
    --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml     --split 100,0,0     --data-seq-length 1048576     --num-workers 8 '
+ CKPT_ARGS='
    --load /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/     --vit-load /     --no-load-optim     --no-load-rng     --seed 42424242     --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743// '
+ OUTPUT_ARGS='
    --log-interval 1     --save-interval 20     --eval-interval 20     --eval-iters 0     --log-throughput     --distributed-timeout-minutes 120 '
+ torchrun --nproc_per_node 16 --nnodes 32 --node_rank 31 --master_addr train-1198772881325351168-93vlj4s2getc-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 8 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 1048576 --max-position-embeddings 1048576 --micro-batch-size 1 --global-batch-size 8 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 500 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-image 4096 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --logit-mask --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml --split 100,0,0 --data-seq-length 1048576 --num-workers 8 --load /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ --vit-load / --no-load-optim --no-load-rng --seed 42424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743// --log-interval 1 --save-interval 20 --eval-interval 20 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl
[2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] 
[2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] *****************************************
[2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
[2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] *****************************************
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp...
Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja...
Building extension module adaptive_cp...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root...
[1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o 
[2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
Loading extension module adaptive_cp...
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32
  warnings.warn("failed to generate the npu_matmul_add_fp32")
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
/root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
> compiling dataset index builder ...
make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets'
>>> done with dataset index builder. Compilation time: 0.483 seconds
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute Falsevision_projector_recompute False

vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_projector_recompute False
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.vision_model_freeze

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.vision_model_freeze
vision_model_freeze

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
vision_model_freeze
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


vision_model_freeze=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
vision_model_freeze=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.


=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)


=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.

=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
vision_model_freeze
=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.
=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.
=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.
=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.
model GPTVLModel(
  (external_feature_model): MegatronVisionModel(
    (vit): InternViTModel(
      (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (position_embeddings): Embedding(1025, 1024)
      (decoder): TransformerBlock(
        (layers): ModuleList(
          (0-23): 24 x InternViTTransformerLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (self_attention): SelfAttention(
              (core_attention): DotProductAttention(
                (scale_mask_softmax): FusedScaleMaskSoftmax()
                (attention_dropout): Dropout(p=0.0, inplace=False)
              )
              (linear_proj): RowParallelLinear()
              (linear_qkv): ColumnParallelLinear()
            )
            (self_attn_bda): IdentityFuncOp()
            (pre_cross_attn_layernorm): IdentityOp()
            (cross_attention): IdentityOp()
            (cross_attn_bda): IdentityFuncOp()
            (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
            (mlp): MLP(
              (linear_fc1): ColumnParallelLinear()
              (linear_fc2): RowParallelLinear()
            )
            (mlp_bda): IdentityFuncOp()
          )
        )
      )
    )
    (vision_projection): MultimodalProjector(
      (encoder): MLP(
        (linear_fc1): ColumnParallelLinear()
        (linear_fc2): RowParallelLinear()
      )
    )
    (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (embedding): LanguageModelEmbedding(
    (word_embeddings): VocabParallelEmbedding()
    (embedding_dropout): Dropout(p=0.0, inplace=False)
  )
  (rotary_pos_emb): RotaryEmbedding()
  (decoder): TransformerBlock(
    (layers): ModuleList(
      (0-47): 48 x TransformerLayer(
        (input_layernorm): RMSNorm()
        (self_attention): SelfAttention(
          (core_attention): DotProductAttention(
            (scale_mask_softmax): FusedScaleMaskSoftmax()
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (linear_proj): RowParallelLinear()
          (linear_qkv): ColumnParallelLinear()
          (q_layernorm): IdentityOp()
          (k_layernorm): IdentityOp()
        )
        (pre_cross_attn_layernorm): IdentityOp()
        (cross_attention): IdentityOp()
        (cross_attn_bda): IdentityFuncOp()
        (pre_mlp_layernorm): RMSNorm()
        (mlp): MLP(
          (linear_fc1): ColumnParallelLinear()
          (linear_fc2): RowParallelLinear()
        )
      )
    )
    (final_layernorm): RMSNorm()
  )
  (output_layer): ColumnParallelLinear()
)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)


_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)

_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)

_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)
_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)
_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)
_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration   _load_base_checkpoint iteration 1000_load_base_checkpoint iteration _load_base_checkpoint iteration
_load_base_checkpoint iteration _load_base_checkpoint release 100010001000


 1000_load_base_checkpoint release   _load_base_checkpoint iteration 10001000 

False_load_base_checkpoint iteration
1000 _load_base_checkpoint release
  _load_base_checkpoint release 10001000
1000
False
1000_load_base_checkpoint release_load_base_checkpoint release_load_base_checkpoint release 1000False 

False_load_base_checkpoint release1000 1000
 _load_base_checkpoint release_load_base_checkpoint release  

False  False

_load_base_checkpoint release 
False

_load_base_checkpoint release1000False

_load_base_checkpoint releaseFalse_load_base_checkpoint release False_load_base_checkpoint release_load_base_checkpoint release  
 
 False
False False
FalseFalse

False


_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_02/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_00/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_03/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_05/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_00/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_01/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_02/model_optim_rng.pt
_load_base_checkpoint_load_base_checkpoint  /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_07/model_optim_rng.pt/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_04/model_optim_rng.pt

_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_03/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_04/model_optim_rng.pt
_load_base_checkpoint_load_base_checkpoint_load_base_checkpoint   /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_06/model_optim_rng.pt/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_01/model_optim_rng.pt/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_06/model_optim_rng.pt


_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_05/model_optim_rng.pt
_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_07/model_optim_rng.pt
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
load_checkpoint iteration 0
load_checkpoint release False
strict True
(min, max) time across ranks (ms):
    load-checkpoint ................................: (35346.45, 35346.82)
> rank 506 does not create GPT datasets ...
> rank 507 does not create GPT datasets ...
> rank 499 does not create GPT datasets ...
> rank 505 does not create GPT datasets ...> rank 503 does not create GPT datasets ...

> rank 501 does not create GPT datasets ...
> rank 511 does not create GPT datasets ...> rank 509 does not create GPT datasets ...

> rank 497 does not create GPT datasets ...
> rank 498 does not create GPT datasets ...
> rank 504 is creating GPT datasets ...> rank 502 does not create GPT datasets ...> rank 508 does not create GPT datasets ...


> rank 510 does not create GPT datasets ...
> rank 500 does not create GPT datasets ...
> rank 496 is creating GPT datasets ...
target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)]
possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]]
target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)]
possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]]
(min, max) time across ranks (ms):
    model-and-optimizer-setup ......................: (35985.64, 35997.94)
    train/valid/test-data-iterators-setup ..........: (302758.09, 303131.07)
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51862cd80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51862cd80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
processed_samples 100 unjoint_samples 100 joint_samples 0 [136301, 185903]
processed_samples 100 unjoint_samples 100 joint_samples 0 [136301, 185903]
processed_samples 100 unjoint_samples 100 joint_samples 0 [161329, 159174]
processed_samples 100 unjoint_samples 100 joint_samples 0 [230777, 221579]
processed_samples 100 unjoint_samples 100 joint_samples 0 [135670, 136846]
processed_samples 100 unjoint_samples 100 joint_samples 0 [161329, 159174]
processed_samples 100 unjoint_samples 100 joint_samples 0 [230777, 221579]
processed_samples 100 unjoint_samples 100 joint_samples 0 [135670, 136846]
processed_samples 100 unjoint_samples 100 joint_samples 0 [185666, 185971]
processed_samples 100 unjoint_samples 100 joint_samples 0 [185666, 185971]
processed_samples 100 unjoint_samples 100 joint_samples 0 [136013, 137062]
processed_samples 100 unjoint_samples 100 joint_samples 0 [136013, 137062]
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 100 unjoint_samples 100 joint_samples 0 [144154, 142029]
processed_samples 100 unjoint_samples 100 joint_samples 0 [144154, 142029]
processed_samples 100 unjoint_samples 100 joint_samples 0 [142372, 140436]
processed_samples 100 unjoint_samples 100 joint_samples 0 [142372, 140436]
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
processed_samples 200 unjoint_samples 200 joint_samples 0 [442317, 476094]
processed_samples 200 unjoint_samples 200 joint_samples 0 [442317, 476094]
processed_samples 200 unjoint_samples 200 joint_samples 0 [304017, 303803]
processed_samples 200 unjoint_samples 200 joint_samples 0 [304017, 303803]
processed_samples 200 unjoint_samples 200 joint_samples 0 [308595, 302406]
processed_samples 200 unjoint_samples 200 joint_samples 0 [308595, 302406]
processed_samples 200 unjoint_samples 200 joint_samples 0 [301352, 305263]
processed_samples 200 unjoint_samples 200 joint_samples 0 [301352, 305263]
processed_samples 200 unjoint_samples 200 joint_samples 0 [317896, 339618]
processed_samples 200 unjoint_samples 200 joint_samples 0 [317896, 339618]
processed_samples 200 unjoint_samples 200 joint_samples 0 [394104, 382765]
processed_samples 200 unjoint_samples 200 joint_samples 0 [394104, 382765]
processed_samples 200 unjoint_samples 200 joint_samples 0 [276598, 277077]
processed_samples 200 unjoint_samples 200 joint_samples 0 [276598, 277077]
processed_samples 200 unjoint_samples 200 joint_samples 0 [361042, 360328]
processed_samples 200 unjoint_samples 200 joint_samples 0 [361042, 360328]
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 300 unjoint_samples 300 joint_samples 0 [492229, 488923]
processed_samples 300 unjoint_samples 300 joint_samples 0 [492229, 488923]
processed_samples 300 unjoint_samples 300 joint_samples 0 [477837, 477325]
processed_samples 300 unjoint_samples 300 joint_samples 0 [477837, 477325]
processed_samples 300 unjoint_samples 300 joint_samples 0 [590298, 590047]
processed_samples 300 unjoint_samples 300 joint_samples 0 [590298, 590047]
processed_samples 300 unjoint_samples 300 joint_samples 0 [500107, 497881]
processed_samples 300 unjoint_samples 300 joint_samples 0 [500107, 497881]
processed_samples 300 unjoint_samples 300 joint_samples 0 [400463, 400576]
processed_samples 300 unjoint_samples 300 joint_samples 0 [400463, 400576]
processed_samples 300 unjoint_samples 300 joint_samples 0 [604230, 621018]
processed_samples 300 unjoint_samples 300 joint_samples 0 [604230, 621018]
processed_samples 300 unjoint_samples 300 joint_samples 0 [511598, 511244]
processed_samples 300 unjoint_samples 300 joint_samples 0 [511598, 511244]
processed_samples 300 unjoint_samples 300 joint_samples 0 [513168, 514079]
processed_samples 300 unjoint_samples 300 joint_samples 0 [513168, 514079]
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
processed_samples 400 unjoint_samples 400 joint_samples 0 [645141, 644007]
processed_samples 400 unjoint_samples 400 joint_samples 0 [645141, 644007]
processed_samples 400 unjoint_samples 400 joint_samples 0 [682733, 684223]
processed_samples 400 unjoint_samples 400 joint_samples 0 [682733, 684223]
processed_samples 400 unjoint_samples 400 joint_samples 0 [629361, 635981]
processed_samples 400 unjoint_samples 400 joint_samples 0 [629361, 635981]
processed_samples 400 unjoint_samples 400 joint_samples 0 [672742, 688581]
processed_samples 400 unjoint_samples 400 joint_samples 0 [672742, 688581]
processed_samples 400 unjoint_samples 400 joint_samples 0 [773371, 774327]
processed_samples 400 unjoint_samples 400 joint_samples 0 [773371, 774327]
processed_samples 400 unjoint_samples 400 joint_samples 0 [707094, 705592]
processed_samples 400 unjoint_samples 400 joint_samples 0 [707094, 705592]
processed_samples 400 unjoint_samples 400 joint_samples 0 [658632, 655574]
processed_samples 400 unjoint_samples 400 joint_samples 0 [658632, 655574]
processed_samples 400 unjoint_samples 400 joint_samples 0 [821555, 816781]
processed_samples 400 unjoint_samples 400 joint_samples 0 [821555, 816781]
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d508162180] mmco: unref short failure
[h264 @ 0x55d508162180] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 500 unjoint_samples 500 joint_samples 0 [841955, 842678]
processed_samples 500 unjoint_samples 500 joint_samples 0 [841955, 842678]
processed_samples 500 unjoint_samples 500 joint_samples 0 [983147, 985847]
processed_samples 500 unjoint_samples 500 joint_samples 0 [881560, 881559]
processed_samples 500 unjoint_samples 500 joint_samples 0 [894887, 912251]
processed_samples 500 unjoint_samples 500 joint_samples 0 [881560, 881559]
processed_samples 500 unjoint_samples 500 joint_samples 0 [983147, 985847]
processed_samples 500 unjoint_samples 500 joint_samples 0 [894887, 912251]
processed_samples 500 unjoint_samples 500 joint_samples 0 [853383, 835984]
processed_samples 500 unjoint_samples 500 joint_samples 0 [853383, 835984]
processed_samples 500 unjoint_samples 500 joint_samples 0 [920412, 921212]
processed_samples 500 unjoint_samples 500 joint_samples 0 [920412, 921212]
processed_samples 500 unjoint_samples 500 joint_samples 0 [811098, 880230]
processed_samples 500 unjoint_samples 500 joint_samples 0 [811098, 880230]
processed_samples 500 unjoint_samples 500 joint_samples 0 [830129, 831340]
processed_samples 500 unjoint_samples 500 joint_samples 0 [830129, 831340]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
................................................................................................ [2024-11-28 16:16:41] iteration        1/     500 | consumed samples:            8 | elapsed time per iteration (ms): 1178502.3 | throughput per GPU (TFLOP/s/GPU): 44.2 | learning rate: 3.333333E-07 | global batch size:     8 | lm loss: 6.629787E+00 | loss scale: 1.0 | grad norm: 71.391 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b2e3080] mmco: unref short failure
[h264 @ 0x56215b2e3080] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-11-28 16:24:22] iteration        2/     500 | consumed samples:           16 | elapsed time per iteration (ms): 460889.5 | throughput per GPU (TFLOP/s/GPU): 112.9 | learning rate: 6.666667E-07 | global batch size:     8 | lm loss: 6.407792E+00 | loss scale: 1.0 | grad norm: 50.194 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 16:30:23] iteration        3/     500 | consumed samples:           24 | elapsed time per iteration (ms): 360271.6 | throughput per GPU (TFLOP/s/GPU): 144.4 | learning rate: 1.000000E-06 | global batch size:     8 | lm loss: 6.310083E+00 | loss scale: 1.0 | grad norm: 43.834 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x55d513c2d080] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 600 unjoint_samples 600 joint_samples 1 [1046784, 140984]
processed_samples 600 unjoint_samples 600 joint_samples 1 [1046784, 140984]
processed_samples 600 unjoint_samples 600 joint_samples 1 [992692, 253508]
processed_samples 600 unjoint_samples 600 joint_samples 1 [992692, 253508]
processed_samples 600 unjoint_samples 600 joint_samples 1 [329003, 1016149]
processed_samples 600 unjoint_samples 600 joint_samples 1 [329003, 1016149]
processed_samples 600 unjoint_samples 600 joint_samples 0 [999002, 1001861]
processed_samples 600 unjoint_samples 600 joint_samples 0 [1023421, 1025360]
processed_samples 600 unjoint_samples 600 joint_samples 0 [935448, 933940]
processed_samples 600 unjoint_samples 600 joint_samples 0 [935448, 933940]
processed_samples 600 unjoint_samples 600 joint_samples 0 [1023421, 1025360]
processed_samples 600 unjoint_samples 600 joint_samples 0 [999002, 1001861]
processed_samples 600 unjoint_samples 600 joint_samples 0 [1028346, 1017978]
processed_samples 600 unjoint_samples 600 joint_samples 0 [1028346, 1017978]
processed_samples 600 unjoint_samples 600 joint_samples 0 [968364, 969654]
processed_samples 600 unjoint_samples 600 joint_samples 0 [968364, 969654]
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
 [2024-11-28 16:37:15] iteration        4/     500 | consumed samples:           32 | elapsed time per iteration (ms): 412089.2 | throughput per GPU (TFLOP/s/GPU): 126.3 | learning rate: 1.333333E-06 | global batch size:     8 | lm loss: 6.219399E+00 | loss scale: 1.0 | grad norm: 57.408 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-11-28 16:43:37] iteration        5/     500 | consumed samples:           40 | elapsed time per iteration (ms): 382756.5 | throughput per GPU (TFLOP/s/GPU): 136.0 | learning rate: 1.666667E-06 | global batch size:     8 | lm loss: 6.177355E+00 | loss scale: 1.0 | grad norm: 40.531 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
 [2024-11-28 16:49:48] iteration        6/     500 | consumed samples:           48 | elapsed time per iteration (ms): 370317.2 | throughput per GPU (TFLOP/s/GPU): 140.5 | learning rate: 2.000000E-06 | global batch size:     8 | lm loss: 6.195541E+00 | loss scale: 1.0 | grad norm: 37.413 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
 [2024-11-28 16:55:57] iteration        7/     500 | consumed samples:           56 | elapsed time per iteration (ms): 369201.8 | throughput per GPU (TFLOP/s/GPU): 141.0 | learning rate: 2.333333E-06 | global batch size:     8 | lm loss: 6.063354E+00 | loss scale: 1.0 | grad norm: 77.346 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
 [2024-11-28 17:02:47] iteration        8/     500 | consumed samples:           64 | elapsed time per iteration (ms): 409604.1 | throughput per GPU (TFLOP/s/GPU): 127.1 | learning rate: 2.666667E-06 | global batch size:     8 | lm loss: 5.870256E+00 | loss scale: 1.0 | grad norm: 28.693 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
processed_samples 700 unjoint_samples 700 joint_samples 1 [252030, 979033]
processed_samples 700 unjoint_samples 700 joint_samples 1 [115459, 1044928]
processed_samples 700 unjoint_samples 700 joint_samples 1 [252030, 979033]
processed_samples 700 unjoint_samples 700 joint_samples 1 [115459, 1044928]
processed_samples 700 unjoint_samples 700 joint_samples 1 [332780, 1018922]
processed_samples 700 unjoint_samples 700 joint_samples 1 [332780, 1018922]
processed_samples 700 unjoint_samples 700 joint_samples 1 [344433, 1036861]
processed_samples 700 unjoint_samples 700 joint_samples 1 [344433, 1036861]
processed_samples 700 unjoint_samples 700 joint_samples 1 [191911, 1045441]
processed_samples 700 unjoint_samples 700 joint_samples 1 [191911, 1045441]
processed_samples 700 unjoint_samples 700 joint_samples 1 [643940, 1016149]
processed_samples 700 unjoint_samples 700 joint_samples 1 [643940, 1016149]
processed_samples 700 unjoint_samples 700 joint_samples 1 [1046784, 453302]
processed_samples 700 unjoint_samples 700 joint_samples 1 [1046784, 453302]
processed_samples 700 unjoint_samples 700 joint_samples 1 [992692, 517451]
processed_samples 700 unjoint_samples 700 joint_samples 1 [992692, 517451]
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
processed_samples 800 unjoint_samples 800 joint_samples 1 [600026, 1018922]
processed_samples 800 unjoint_samples 800 joint_samples 1 [600026, 1018922]
processed_samples 800 unjoint_samples 800 joint_samples 1 [480367, 1044928]
processed_samples 800 unjoint_samples 800 joint_samples 1 [480367, 1044928]
processed_samples 800 unjoint_samples 800 joint_samples 1 [1046784, 733804]
processed_samples 800 unjoint_samples 800 joint_samples 1 [443725, 1045441]
processed_samples 800 unjoint_samples 800 joint_samples 1 [443725, 1045441]
processed_samples 800 unjoint_samples 800 joint_samples 1 [980254, 1016149]
processed_samples 800 unjoint_samples 800 joint_samples 1 [721471, 1036861]
processed_samples 800 unjoint_samples 800 joint_samples 1 [1046784, 733804]
processed_samples 800 unjoint_samples 800 joint_samples 1 [992692, 843491]
processed_samples 800 unjoint_samples 800 joint_samples 1 [730050, 979033]
processed_samples 800 unjoint_samples 800 joint_samples 1 [721471, 1036861]
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 800 unjoint_samples 800 joint_samples 1 [992692, 843491]
processed_samples 800 unjoint_samples 800 joint_samples 1 [730050, 979033]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 800 unjoint_samples 800 joint_samples 1 [980254, 1016149]
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
 [2024-11-28 17:13:48] iteration        9/     500 | consumed samples:           72 | elapsed time per iteration (ms): 660898.0 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 3.000000E-06 | global batch size:     8 | lm loss: 5.787996E+00 | loss scale: 1.0 | grad norm: 29.784 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
 [2024-11-28 17:21:06] iteration       10/     500 | consumed samples:           80 | elapsed time per iteration (ms): 438615.6 | throughput per GPU (TFLOP/s/GPU): 118.6 | learning rate: 3.333333E-06 | global batch size:     8 | lm loss: 5.712118E+00 | loss scale: 1.0 | grad norm: 43.059 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
 [2024-11-28 17:27:12] iteration       11/     500 | consumed samples:           88 | elapsed time per iteration (ms): 365964.4 | throughput per GPU (TFLOP/s/GPU): 142.2 | learning rate: 3.666667E-06 | global batch size:     8 | lm loss: 5.826314E+00 | loss scale: 1.0 | grad norm: 24.661 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
processed_samples 900 unjoint_samples 900 joint_samples 1 [685603, 1044928]
processed_samples 900 unjoint_samples 900 joint_samples 1 [685603, 1044928]
processed_samples 900 unjoint_samples 900 joint_samples 2 [1035512, 215070]
processed_samples 900 unjoint_samples 900 joint_samples 2 [1035512, 215070]
processed_samples 900 unjoint_samples 900 joint_samples 2 [169160, 1045271]
processed_samples 900 unjoint_samples 900 joint_samples 1 [870427, 1018922]
processed_samples 900 unjoint_samples 900 joint_samples 1 [870427, 1018922]
processed_samples 900 unjoint_samples 900 joint_samples 2 [169160, 1045271]
processed_samples 900 unjoint_samples 900 joint_samples 2 [44304, 1042650]
processed_samples 900 unjoint_samples 900 joint_samples 2 [44304, 1042650]
processed_samples 900 unjoint_samples 900 joint_samples 1 [1009966, 1036861]
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
processed_samples 900 unjoint_samples 900 joint_samples 1 [1009966, 1036861]
processed_samples 900 unjoint_samples 900 joint_samples 1 [1002137, 1001356]
processed_samples 900 unjoint_samples 900 joint_samples 1 [769375, 1045441]
processed_samples 900 unjoint_samples 900 joint_samples 1 [1002137, 1001356]
processed_samples 900 unjoint_samples 900 joint_samples 1 [769375, 1045441]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
 [2024-11-28 17:34:31] iteration       12/     500 | consumed samples:           96 | elapsed time per iteration (ms): 439260.5 | throughput per GPU (TFLOP/s/GPU): 118.5 | learning rate: 4.000000E-06 | global batch size:     8 | lm loss: 5.277246E+00 | loss scale: 1.0 | grad norm: 25.843 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
 [2024-11-28 17:40:54] iteration       13/     500 | consumed samples:          104 | elapsed time per iteration (ms): 382309.0 | throughput per GPU (TFLOP/s/GPU): 136.1 | learning rate: 4.333333E-06 | global batch size:     8 | lm loss: 5.065854E+00 | loss scale: 1.0 | grad norm: 19.160 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
 [2024-11-28 17:47:00] iteration       14/     500 | consumed samples:          112 | elapsed time per iteration (ms): 366539.7 | throughput per GPU (TFLOP/s/GPU): 142.0 | learning rate: 4.666667E-06 | global batch size:     8 | lm loss: 4.861617E+00 | loss scale: 1.0 | grad norm: 18.576 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 17:54:35] iteration       15/     500 | consumed samples:          120 | elapsed time per iteration (ms): 454227.2 | throughput per GPU (TFLOP/s/GPU): 114.6 | learning rate: 5.000000E-06 | global batch size:     8 | lm loss: 5.125950E+00 | loss scale: 1.0 | grad norm: 21.127 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 18:00:40] iteration       16/     500 | consumed samples:          128 | elapsed time per iteration (ms): 365586.7 | throughput per GPU (TFLOP/s/GPU): 142.3 | learning rate: 4.999949E-06 | global batch size:     8 | lm loss: 4.770656E+00 | loss scale: 1.0 | grad norm: 16.546 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1045990, 46173]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1045990, 46173]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1029364, 115962]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1029364, 115962]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1047713, 111784]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1047713, 111784]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [84176, 1047539]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [84176, 1047539]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [389118, 1042650]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [389118, 1042650]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1035512, 460681]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1035512, 460681]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [542798, 1045271]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [542798, 1045271]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1036356, 343956]
processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1036356, 343956]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215bac3ac0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1047713, 390717]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1047713, 390717]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1045990, 377675]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1029364, 428331]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1045990, 377675]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1029364, 428331]
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [379155, 1047539]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [379155, 1047539]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [737670, 1042650]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [737670, 1042650]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [937076, 1045271]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1036356, 690281]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [937076, 1045271]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1036356, 690281]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1035512, 745640]
processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1035512, 745640]
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
 [2024-11-28 18:11:45] iteration       17/     500 | consumed samples:          136 | elapsed time per iteration (ms): 665218.5 | throughput per GPU (TFLOP/s/GPU): 78.2 | learning rate: 4.999794E-06 | global batch size:     8 | lm loss: 4.786225E+00 | loss scale: 1.0 | grad norm: 75.620 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1045990, 736951]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1029364, 716107]
processed_samples 1200 unjoint_samples 1200 joint_samples 3 [178926, 1047160]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1045990, 736951]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1029364, 716107]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1047713, 764007]
processed_samples 1200 unjoint_samples 1200 joint_samples 3 [178926, 1047160]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1036356, 1004222]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [963999, 1042650]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1047713, 764007]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1036356, 1004222]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1043823, 1042971]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [963999, 1042650]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [660021, 1047539]
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1043823, 1042971]
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 1200 unjoint_samples 1200 joint_samples 2 [660021, 1047539]
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
 [2024-11-28 18:19:51] iteration       18/     500 | consumed samples:          144 | elapsed time per iteration (ms): 485884.5 | throughput per GPU (TFLOP/s/GPU): 107.1 | learning rate: 4.999537E-06 | global batch size:     8 | lm loss: 4.659023E+00 | loss scale: 1.0 | grad norm: 14.797 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
 [2024-11-28 18:27:20] iteration       19/     500 | consumed samples:          152 | elapsed time per iteration (ms): 449170.3 | throughput per GPU (TFLOP/s/GPU): 115.9 | learning rate: 4.999178E-06 | global batch size:     8 | lm loss: 4.296840E+00 | loss scale: 1.0 | grad norm: 10.513 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
 [2024-11-28 18:34:39] iteration       20/     500 | consumed samples:          160 | elapsed time per iteration (ms): 438903.9 | throughput per GPU (TFLOP/s/GPU): 118.6 | learning rate: 4.998715E-06 | global batch size:     8 | lm loss: 4.165969E+00 | loss scale: 1.0 | grad norm: 8.266 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (107908.65, 107908.97)
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
 [2024-11-28 18:44:02] iteration       21/     500 | consumed samples:          168 | elapsed time per iteration (ms): 454286.3 | throughput per GPU (TFLOP/s/GPU): 114.6 | learning rate: 4.998150E-06 | global batch size:     8 | lm loss: 4.101646E+00 | loss scale: 1.0 | grad norm: 8.682 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1046489, 7826]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1046489, 7826]
processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1045990, 1007691]
processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1045990, 1007691]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [481372, 1047160]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [481372, 1047160]
processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1047713, 994410]
processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1047713, 994410]
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1045557, 290992]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1045557, 290992]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [4593, 1047539]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [275734, 1043972]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [275734, 1043972]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [4593, 1047539]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1048104, 255593]
processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1048104, 255593]
 [2024-11-28 18:49:43] iteration       22/     500 | consumed samples:          176 | elapsed time per iteration (ms): 341280.8 | throughput per GPU (TFLOP/s/GPU): 152.5 | learning rate: 4.997482E-06 | global batch size:     8 | lm loss: 3.860502E+00 | loss scale: 1.0 | grad norm: 12.678 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
 [2024-11-28 18:56:32] iteration       23/     500 | consumed samples:          184 | elapsed time per iteration (ms): 408951.4 | throughput per GPU (TFLOP/s/GPU): 127.3 | learning rate: 4.996711E-06 | global batch size:     8 | lm loss: 3.689850E+00 | loss scale: 1.0 | grad norm: 8.323 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 19:02:55] iteration       24/     500 | consumed samples:          192 | elapsed time per iteration (ms): 383067.5 | throughput per GPU (TFLOP/s/GPU): 135.9 | learning rate: 4.995838E-06 | global batch size:     8 | lm loss: 3.747823E+00 | loss scale: 1.0 | grad norm: 8.168 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [176477, 1047743]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [232404, 1047554]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1046489, 339204]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [329617, 1047539]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [176477, 1047743]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1046489, 339204]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [232404, 1047554]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [612513, 1043972]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1045557, 589201]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [329617, 1047539]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [817074, 1047160]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [612513, 1043972]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1048104, 574654]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1048104, 574654]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1045557, 589201]
processed_samples 1400 unjoint_samples 1400 joint_samples 3 [817074, 1047160]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b95e700] mmco: unref short failure
[h264 @ 0x56215b95e700] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
 [2024-11-28 19:13:38] iteration       25/     500 | consumed samples:          200 | elapsed time per iteration (ms): 643017.6 | throughput per GPU (TFLOP/s/GPU): 80.9 | learning rate: 4.994862E-06 | global batch size:     8 | lm loss: 3.567746E+00 | loss scale: 1.0 | grad norm: 7.384 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 1500 unjoint_samples 1500 joint_samples 4 [15562, 1047160]
processed_samples 1500 unjoint_samples 1500 joint_samples 4 [15562, 1047160]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [616066, 1047554]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1048104, 907324]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1046489, 623184]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1046489, 623184]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [616066, 1047554]
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [908565, 1043972]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1048104, 907324]
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [624037, 1047539]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [624037, 1047539]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [602378, 1047743]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1045557, 935157]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1045557, 935157]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [908565, 1043972]
processed_samples 1500 unjoint_samples 1500 joint_samples 3 [602378, 1047743]
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
 [2024-11-28 19:21:41] iteration       26/     500 | consumed samples:          208 | elapsed time per iteration (ms): 482923.1 | throughput per GPU (TFLOP/s/GPU): 107.8 | learning rate: 4.993783E-06 | global batch size:     8 | lm loss: 3.596097E+00 | loss scale: 1.0 | grad norm: 6.934 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
 [2024-11-28 19:28:44] iteration       27/     500 | consumed samples:          216 | elapsed time per iteration (ms): 422673.3 | throughput per GPU (TFLOP/s/GPU): 123.1 | learning rate: 4.992602E-06 | global batch size:     8 | lm loss: 3.274019E+00 | loss scale: 1.0 | grad norm: 6.573 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
 [2024-11-28 19:35:43] iteration       28/     500 | consumed samples:          224 | elapsed time per iteration (ms): 419757.8 | throughput per GPU (TFLOP/s/GPU): 124.0 | learning rate: 4.991319E-06 | global batch size:     8 | lm loss: 3.158726E+00 | loss scale: 1.0 | grad norm: 5.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1046489, 946723]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [114808, 1047945]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [114808, 1047945]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [286993, 1047160]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [285396, 1047119]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [286993, 1047160]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [155550, 1042992]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [285396, 1047119]
processed_samples 1600 unjoint_samples 1600 joint_samples 4 [155550, 1042992]
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1046489, 946723]
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [927098, 1047743]
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1010244, 1047539]
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [927098, 1047743]
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1010244, 1047539]
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [917247, 1047554]
processed_samples 1600 unjoint_samples 1600 joint_samples 3 [917247, 1047554]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
 [2024-11-28 19:43:34] iteration       29/     500 | consumed samples:          232 | elapsed time per iteration (ms): 470278.0 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 4.989933E-06 | global batch size:     8 | lm loss: 3.036888E+00 | loss scale: 1.0 | grad norm: 16.115 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-28 19:50:26] iteration       30/     500 | consumed samples:          240 | elapsed time per iteration (ms): 412673.2 | throughput per GPU (TFLOP/s/GPU): 126.1 | learning rate: 4.988444E-06 | global batch size:     8 | lm loss: 2.989427E+00 | loss scale: 1.0 | grad norm: 25.443 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
 [2024-11-28 19:56:45] iteration       31/     500 | consumed samples:          248 | elapsed time per iteration (ms): 378909.1 | throughput per GPU (TFLOP/s/GPU): 137.3 | learning rate: 4.986854E-06 | global batch size:     8 | lm loss: 2.768105E+00 | loss scale: 1.0 | grad norm: 12.204 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-28 20:03:40] iteration       32/     500 | consumed samples:          256 | elapsed time per iteration (ms): 415089.0 | throughput per GPU (TFLOP/s/GPU): 125.4 | learning rate: 4.985161E-06 | global batch size:     8 | lm loss: 2.958685E+00 | loss scale: 1.0 | grad norm: 69.520 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [152708, 1047743]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [152708, 1047743]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1046327, 286245]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1046327, 286245]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [227378, 1046063]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [227378, 1046063]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [369421, 1047945]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [369421, 1047945]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1042972, 142693]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1042972, 142693]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [644299, 1047119]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [570080, 1047160]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [435158, 1042992]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [435158, 1042992]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [644299, 1047119]
processed_samples 1700 unjoint_samples 1700 joint_samples 4 [570080, 1047160]
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514f19000] mmco: unref short failure
[h264 @ 0x55d514f19000] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514f19000] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [577867, 1046063]
[h264 @ 0x56215b9b7740] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1042972, 527675]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [429748, 1047743]
[h264 @ 0x55d5194f31c0] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [708231, 1042992]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1046327, 547165]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [627333, 1047945]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [896495, 1047160]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [871854, 1047119]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [577867, 1046063]
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1042972, 527675]
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [429748, 1047743]
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [708231, 1042992]
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [896495, 1047160]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1046327, 547165]
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [627333, 1047945]
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
processed_samples 1800 unjoint_samples 1800 joint_samples 4 [871854, 1047119]
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512ccf0c0] mmco: unref short failure
[h264 @ 0x55d512ccf0c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
 [2024-11-28 20:14:30] iteration       33/     500 | consumed samples:          264 | elapsed time per iteration (ms): 649242.9 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 4.983366E-06 | global batch size:     8 | lm loss: 2.799433E+00 | loss scale: 1.0 | grad norm: 8.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 20:23:00] iteration       34/     500 | consumed samples:          272 | elapsed time per iteration (ms): 510659.3 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 4.981468E-06 | global batch size:     8 | lm loss: 2.573955E+00 | loss scale: 1.0 | grad norm: 9.165 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215e8edc00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215e8edc00] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 5 [273341, 1047160]
processed_samples 1900 unjoint_samples 1900 joint_samples 5 [105106, 1047119]
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1042972, 818526]
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [743031, 1047743]
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1039483, 1042992]
[h264 @ 0x55d51723b300] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [862930, 1046063]
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1046327, 1006557]
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1031627, 1047945]
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 5 [273341, 1047160]
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [743031, 1047743]
processed_samples 1900 unjoint_samples 1900 joint_samples 5 [105106, 1047119]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1042972, 818526]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [862930, 1046063]
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1039483, 1042992]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1046327, 1006557]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1031627, 1047945]
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
 [2024-11-28 20:30:33] iteration       35/     500 | consumed samples:          280 | elapsed time per iteration (ms): 452777.2 | throughput per GPU (TFLOP/s/GPU): 114.9 | learning rate: 4.979469E-06 | global batch size:     8 | lm loss: 2.710365E+00 | loss scale: 1.0 | grad norm: 8.172 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 20:38:40] iteration       36/     500 | consumed samples:          288 | elapsed time per iteration (ms): 486962.5 | throughput per GPU (TFLOP/s/GPU): 106.9 | learning rate: 4.977368E-06 | global batch size:     8 | lm loss: 2.466654E+00 | loss scale: 1.0 | grad norm: 8.771 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-28 20:45:53] iteration       37/     500 | consumed samples:          296 | elapsed time per iteration (ms): 433240.8 | throughput per GPU (TFLOP/s/GPU): 120.1 | learning rate: 4.975165E-06 | global batch size:     8 | lm loss: 2.457261E+00 | loss scale: 1.0 | grad norm: 6.649 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-11-28 20:52:30] iteration       38/     500 | consumed samples:          304 | elapsed time per iteration (ms): 396727.3 | throughput per GPU (TFLOP/s/GPU): 131.2 | learning rate: 4.972860E-06 | global batch size:     8 | lm loss: 2.120668E+00 | loss scale: 1.0 | grad norm: 9.612 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-11-28 20:59:15] iteration       39/     500 | consumed samples:          312 | elapsed time per iteration (ms): 404869.9 | throughput per GPU (TFLOP/s/GPU): 128.5 | learning rate: 4.970454E-06 | global batch size:     8 | lm loss: 2.032905E+00 | loss scale: 1.0 | grad norm: 6.617 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] [h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1025880, 172231]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1045633, 404501]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1047005, 247576]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [614626, 1047160]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1046427, 219041]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [82963, 1044981]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [389277, 1047119]
processed_samples 2000 unjoint_samples 2000 joint_samples 4 [1025173, 1047743]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1025880, 172231]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1045633, 404501]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1047005, 247576]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [82963, 1044981]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [614626, 1047160]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1046427, 219041]
processed_samples 2000 unjoint_samples 2000 joint_samples 5 [389277, 1047119]
processed_samples 2000 unjoint_samples 2000 joint_samples 4 [1025173, 1047743]
 [2024-11-28 21:07:02] iteration       40/     500 | consumed samples:          320 | elapsed time per iteration (ms): 467126.8 | throughput per GPU (TFLOP/s/GPU): 111.4 | learning rate: 4.967946E-06 | global batch size:     8 | lm loss: 2.002694E+00 | loss scale: 1.0 | grad norm: 8.476 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (230397.23, 230397.69)
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [417457, 1044981]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1025880, 494031]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1029293, 416295]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1047005, 733854]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1046427, 573043]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1025880, 494031]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [417457, 1044981]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1045633, 793613]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1029293, 416295]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [678301, 1047119]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [924889, 1047160]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1047005, 733854]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1046427, 573043]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [924889, 1047160]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1045633, 793613]
processed_samples 2100 unjoint_samples 2100 joint_samples 5 [678301, 1047119]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
 [2024-11-28 21:20:28] iteration       41/     500 | consumed samples:          328 | elapsed time per iteration (ms): 575913.1 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.965337E-06 | global batch size:     8 | lm loss: 2.023730E+00 | loss scale: 1.0 | grad norm: 7.992 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
 [2024-11-28 21:27:41] iteration       42/     500 | consumed samples:          336 | elapsed time per iteration (ms): 432358.6 | throughput per GPU (TFLOP/s/GPU): 120.4 | learning rate: 4.962626E-06 | global batch size:     8 | lm loss: 1.950212E+00 | loss scale: 1.0 | grad norm: 10.135 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51325e000] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
 [2024-11-28 21:35:07] iteration       43/     500 | consumed samples:          344 | elapsed time per iteration (ms): 446286.7 | throughput per GPU (TFLOP/s/GPU): 116.6 | learning rate: 4.959814E-06 | global batch size:     8 | lm loss: 1.985991E+00 | loss scale: 1.0 | grad norm: 8.730 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 2200 unjoint_samples 2200 joint_samples 6 [64780, 1022454]
processed_samples 2200 unjoint_samples 2200 joint_samples 6 [153094, 1047160]
processed_samples 2200 unjoint_samples 2200 joint_samples 6 [153094, 1047160]
processed_samples 2200 unjoint_samples 2200 joint_samples 6 [64780, 1022454]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1029293, 760834]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025880, 953414]
processed_samples 2200 unjoint_samples 2200 joint_samples 6 [17659, 1045609]
processed_samples 2200 unjoint_samples 2200 joint_samples 6 [17659, 1045609]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1046427, 997952]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1029293, 760834]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [680997, 1044981]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [680997, 1044981]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025880, 953414]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025348, 1047119]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025348, 1047119]
processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1046427, 997952]
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
 [2024-11-28 21:42:48] iteration       44/     500 | consumed samples:          352 | elapsed time per iteration (ms): 460424.0 | throughput per GPU (TFLOP/s/GPU): 113.0 | learning rate: 4.956901E-06 | global batch size:     8 | lm loss: 1.866224E+00 | loss scale: 1.0 | grad norm: 7.184 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-28 21:49:34] iteration       45/     500 | consumed samples:          360 | elapsed time per iteration (ms): 406346.7 | throughput per GPU (TFLOP/s/GPU): 128.1 | learning rate: 4.953887E-06 | global batch size:     8 | lm loss: 1.858991E+00 | loss scale: 1.0 | grad norm: 10.926 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-11-28 21:56:53] iteration       46/     500 | consumed samples:          368 | elapsed time per iteration (ms): 439465.9 | throughput per GPU (TFLOP/s/GPU): 118.4 | learning rate: 4.950772E-06 | global batch size:     8 | lm loss: 1.825678E+00 | loss scale: 1.0 | grad norm: 9.273 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [168460, 1021869]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [418718, 1047160]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [299051, 1046402]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [328027, 1045609]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [384015, 1022454]
processed_samples 2300 unjoint_samples 2300 joint_samples 5 [1029293, 999533]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [1046773, 296268]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [168460, 1021869]
processed_samples 2300 unjoint_samples 2300 joint_samples 5 [997435, 1044981]
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 2300 unjoint_samples 2300 joint_samples 5 [997435, 1044981]
[h264 @ 0x55d514594d00] mmco: unref short failure
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [299051, 1046402]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [328027, 1045609]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [1046773, 296268]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [384015, 1022454]
processed_samples 2300 unjoint_samples 2300 joint_samples 6 [418718, 1047160]
processed_samples 2300 unjoint_samples 2300 joint_samples 5 [1029293, 999533]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-11-28 22:04:56] iteration       47/     500 | consumed samples:          376 | elapsed time per iteration (ms): 482488.9 | throughput per GPU (TFLOP/s/GPU): 107.9 | learning rate: 4.947556E-06 | global batch size:     8 | lm loss: 1.775909E+00 | loss scale: 1.0 | grad norm: 10.026 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-28 22:13:24] iteration       48/     500 | consumed samples:          384 | elapsed time per iteration (ms): 508396.6 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 4.944240E-06 | global batch size:     8 | lm loss: 1.686098E+00 | loss scale: 1.0 | grad norm: 8.427 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [214219, 1046833]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [531637, 1021869]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1041184, 189346]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [662814, 1045609]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1046773, 620573]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [717427, 1022454]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [562488, 1046402]
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [779201, 1047160]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [531637, 1021869]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [214219, 1046833]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1041184, 189346]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [779201, 1047160]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [662814, 1045609]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1046773, 620573]
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [562488, 1046402]
processed_samples 2400 unjoint_samples 2400 joint_samples 6 [717427, 1022454]
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
 [2024-11-28 22:22:49] iteration       49/     500 | consumed samples:          392 | elapsed time per iteration (ms): 565072.6 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.940823E-06 | global batch size:     8 | lm loss: 1.693457E+00 | loss scale: 1.0 | grad norm: 8.878 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
 [2024-11-28 22:33:36] iteration       50/     500 | consumed samples:          400 | elapsed time per iteration (ms): 646385.2 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 4.937306E-06 | global batch size:     8 | lm loss: 1.632946E+00 | loss scale: 1.0 | grad norm: 9.775 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [963034, 1045609]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1041184, 484451]
processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1040035, 152091]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [963034, 1045609]
processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1040035, 152091]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1041184, 484451]
processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1037182, 27542]
processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1037182, 27542]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [898543, 1046402]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [653431, 1046833]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [653431, 1046833]
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1046773, 1005163]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [780576, 1021869]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [780576, 1021869]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [898543, 1046402]
processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1046773, 1005163]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
 [2024-11-28 22:43:43] iteration       51/     500 | consumed samples:          408 | elapsed time per iteration (ms): 606950.8 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 4.933689E-06 | global batch size:     8 | lm loss: 1.538779E+00 | loss scale: 1.0 | grad norm: 6.345 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
 [2024-11-28 22:52:19] iteration       52/     500 | consumed samples:          416 | elapsed time per iteration (ms): 516548.8 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 4.929971E-06 | global batch size:     8 | lm loss: 1.636017E+00 | loss scale: 1.0 | grad norm: 6.369 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
 [2024-11-28 22:58:46] iteration       53/     500 | consumed samples:          424 | elapsed time per iteration (ms): 386918.3 | throughput per GPU (TFLOP/s/GPU): 134.5 | learning rate: 4.926154E-06 | global batch size:     8 | lm loss: 1.564707E+00 | loss scale: 1.0 | grad norm: 6.224 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
 [2024-11-28 23:06:07] iteration       54/     500 | consumed samples:          432 | elapsed time per iteration (ms): 441009.5 | throughput per GPU (TFLOP/s/GPU): 118.0 | learning rate: 4.922237E-06 | global batch size:     8 | lm loss: 1.536033E+00 | loss scale: 1.0 | grad norm: 14.530 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1030392, 45401]
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1037182, 451076]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1033539, 202907]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1030392, 45401]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1047495, 231869]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [218226, 1046760]
processed_samples 2600 unjoint_samples 2600 joint_samples 6 [1041184, 749174]
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1040035, 418957]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1037182, 451076]
processed_samples 2600 unjoint_samples 2600 joint_samples 6 [920232, 1046833]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1033539, 202907]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1047495, 231869]
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [218226, 1046760]
processed_samples 2600 unjoint_samples 2600 joint_samples 6 [1041184, 749174]
processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1040035, 418957]
processed_samples 2600 unjoint_samples 2600 joint_samples 6 [920232, 1046833]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
 [2024-11-28 23:13:26] iteration       55/     500 | consumed samples:          440 | elapsed time per iteration (ms): 439133.8 | throughput per GPU (TFLOP/s/GPU): 118.5 | learning rate: 4.918221E-06 | global batch size:     8 | lm loss: 1.505980E+00 | loss scale: 1.0 | grad norm: 7.839 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
 [2024-11-28 23:21:09] iteration       56/     500 | consumed samples:          448 | elapsed time per iteration (ms): 463070.7 | throughput per GPU (TFLOP/s/GPU): 112.4 | learning rate: 4.914105E-06 | global batch size:     8 | lm loss: 1.489059E+00 | loss scale: 1.0 | grad norm: 5.188 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1030392, 346973]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [182480, 1023531]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [181982, 1046833]
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1047495, 577059]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1030392, 346973]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [182480, 1023531]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [181982, 1046833]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [501986, 1046760]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1047495, 577059]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [501986, 1046760]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1033539, 488905]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1037182, 785261]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1033539, 488905]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1040035, 635866]
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1040035, 635866]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1037182, 785261]
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-11-28 23:30:13] iteration       57/     500 | consumed samples:          456 | elapsed time per iteration (ms): 543396.8 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 4.909890E-06 | global batch size:     8 | lm loss: 1.563977E+00 | loss scale: 1.0 | grad norm: 8.261 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [458308, 1046833]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [458308, 1046833]
processed_samples 2800 unjoint_samples 2800 joint_samples 8 [10513, 1046053]
processed_samples 2800 unjoint_samples 2800 joint_samples 8 [10513, 1046053]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1040035, 936194]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1040035, 936194]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1030392, 694809]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1030392, 694809]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [762415, 1046760]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [762415, 1046760]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1047495, 905613]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1047495, 905613]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [542048, 1023531]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [542048, 1023531]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1033539, 903348]
processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1033539, 903348]
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-11-28 23:38:26] iteration       58/     500 | consumed samples:          464 | elapsed time per iteration (ms): 493526.9 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.905577E-06 | global batch size:     8 | lm loss: 1.370810E+00 | loss scale: 1.0 | grad norm: 4.837 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-28 23:48:14] iteration       59/     500 | consumed samples:          472 | elapsed time per iteration (ms): 587853.1 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 4.901164E-06 | global batch size:     8 | lm loss: 1.455521E+00 | loss scale: 1.0 | grad norm: 3.914 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-28 23:56:10] iteration       60/     500 | consumed samples:          480 | elapsed time per iteration (ms): 475241.1 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 4.896652E-06 | global batch size:     8 | lm loss: 1.443850E+00 | loss scale: 1.0 | grad norm: 3.787 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (225408.87, 225409.30)
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [183546, 1039187]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [912879, 1023531]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [192338, 1036659]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1004522, 1046760]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [159690, 1030365]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [820339, 1046833]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [183546, 1039187]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [266009, 1046053]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [912879, 1023531]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1030392, 1025591]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [820339, 1046833]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [192338, 1036659]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1004522, 1046760]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [159690, 1030365]
processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1030392, 1025591]
processed_samples 2900 unjoint_samples 2900 joint_samples 8 [266009, 1046053]
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
 [2024-11-29 00:07:26] iteration       61/     500 | consumed samples:          488 | elapsed time per iteration (ms): 450783.8 | throughput per GPU (TFLOP/s/GPU): 115.4 | learning rate: 4.892043E-06 | global batch size:     8 | lm loss: 1.438554E+00 | loss scale: 1.0 | grad norm: 11.352 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
 [2024-11-29 00:14:31] iteration       62/     500 | consumed samples:          496 | elapsed time per iteration (ms): 425191.4 | throughput per GPU (TFLOP/s/GPU): 122.4 | learning rate: 4.887334E-06 | global batch size:     8 | lm loss: 1.420748E+00 | loss scale: 1.0 | grad norm: 4.017 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-29 00:22:50] iteration       63/     500 | consumed samples:          504 | elapsed time per iteration (ms): 498935.6 | throughput per GPU (TFLOP/s/GPU): 104.3 | learning rate: 4.882528E-06 | global batch size:     8 | lm loss: 1.325493E+00 | loss scale: 1.0 | grad norm: 2.921 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-11-29 00:30:35] iteration       64/     500 | consumed samples:          512 | elapsed time per iteration (ms): 465537.3 | throughput per GPU (TFLOP/s/GPU): 111.8 | learning rate: 4.877624E-06 | global batch size:     8 | lm loss: 1.342766E+00 | loss scale: 1.0 | grad norm: 2.684 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516ee2340] mmco: unref short failure
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [209876, 1047479]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1038574, 108667]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [467679, 1039187]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1043814, 238270]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [442152, 1030365]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [562463, 1046053]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1032263, 156247]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [579221, 1036659]
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1043814, 238270]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1038574, 108667]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [209876, 1047479]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1032263, 156247]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [442152, 1030365]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [467679, 1039187]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [562463, 1046053]
processed_samples 3000 unjoint_samples 3000 joint_samples 8 [579221, 1036659]
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516b664c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516b664c0] mmco: unref short failure
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1038574, 509673]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [470770, 1047479]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1032263, 503308]
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1043814, 597379]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [894532, 1036659]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [893361, 1046053]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [825304, 1039187]
[h264 @ 0x55d514dfd500] mmco: unref short failure
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [780811, 1030365]
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1038574, 509673]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [470770, 1047479]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1032263, 503308]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [825304, 1039187]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1043814, 597379]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [893361, 1046053]
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [780811, 1030365]
processed_samples 3100 unjoint_samples 3100 joint_samples 8 [894532, 1036659]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-29 00:41:20] iteration       65/     500 | consumed samples:          520 | elapsed time per iteration (ms): 644551.8 | throughput per GPU (TFLOP/s/GPU): 80.7 | learning rate: 4.872622E-06 | global batch size:     8 | lm loss: 1.464056E+00 | loss scale: 1.0 | grad norm: 3.641 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 00:49:29] iteration       66/     500 | consumed samples:          528 | elapsed time per iteration (ms): 489095.8 | throughput per GPU (TFLOP/s/GPU): 106.4 | learning rate: 4.867523E-06 | global batch size:     8 | lm loss: 1.302190E+00 | loss scale: 1.0 | grad norm: 3.139 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 00:58:04] iteration       67/     500 | consumed samples:          536 | elapsed time per iteration (ms): 514950.5 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 4.862327E-06 | global batch size:     8 | lm loss: 1.271642E+00 | loss scale: 1.0 | grad norm: 4.161 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
 [2024-11-29 01:06:04] iteration       68/     500 | consumed samples:          544 | elapsed time per iteration (ms): 480233.0 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 4.857033E-06 | global batch size:     8 | lm loss: 1.318385E+00 | loss scale: 1.0 | grad norm: 3.545 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1046643, 153549]
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1038574, 944139]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1043814, 892001]
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1033502, 182363]
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1032263, 770555]
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [154998, 1032569]
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1047130, 373515]
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [851493, 1047479]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1046643, 153549]
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1038574, 944139]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1043814, 892001]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1033502, 182363]
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [154998, 1032569]
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1032263, 770555]
processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1047130, 373515]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 3200 unjoint_samples 3200 joint_samples 8 [851493, 1047479]
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-11-29 01:14:21] iteration       69/     500 | consumed samples:          552 | elapsed time per iteration (ms): 496961.5 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 4.851643E-06 | global batch size:     8 | lm loss: 1.259558E+00 | loss scale: 1.0 | grad norm: 2.552 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51a2c2780] mmco: unref short failure
[h264 @ 0x55d51a2c2780] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
 [2024-11-29 01:22:13] iteration       70/     500 | consumed samples:          560 | elapsed time per iteration (ms): 471204.7 | throughput per GPU (TFLOP/s/GPU): 110.4 | learning rate: 4.846156E-06 | global batch size:     8 | lm loss: 1.274312E+00 | loss scale: 1.0 | grad norm: 2.886 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
 [2024-11-29 01:29:27] iteration       71/     500 | consumed samples:          568 | elapsed time per iteration (ms): 434341.8 | throughput per GPU (TFLOP/s/GPU): 119.8 | learning rate: 4.840573E-06 | global batch size:     8 | lm loss: 1.292556E+00 | loss scale: 1.0 | grad norm: 4.098 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046506, 84077]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1044719, 190027]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046590, 224059]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [400124, 1032569]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1033502, 425149]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046643, 487770]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1047130, 726736]
processed_samples 3300 unjoint_samples 3300 joint_samples 8 [1044563, 1042250]
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046506, 84077]
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1044719, 190027]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046590, 224059]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1033502, 425149]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046643, 487770]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [400124, 1032569]
processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1047130, 726736]
processed_samples 3300 unjoint_samples 3300 joint_samples 8 [1044563, 1042250]
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
 [2024-11-29 01:38:44] iteration       72/     500 | consumed samples:          576 | elapsed time per iteration (ms): 556913.5 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 4.834894E-06 | global batch size:     8 | lm loss: 1.166198E+00 | loss scale: 1.0 | grad norm: 2.673 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
 [2024-11-29 01:47:56] iteration       73/     500 | consumed samples:          584 | elapsed time per iteration (ms): 552456.3 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.829119E-06 | global batch size:     8 | lm loss: 1.214278E+00 | loss scale: 1.0 | grad norm: 2.226 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1033502, 756251]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1033502, 756251]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046590, 456265]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1044719, 601555]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1044719, 601555]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046742, 222638]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046590, 456265]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046506, 428089]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046742, 222638]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046506, 428089]
processed_samples 3400 unjoint_samples 3400 joint_samples 10 [655391, 993746]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046643, 969469]
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 3400 unjoint_samples 3400 joint_samples 10 [655391, 993746]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046643, 969469]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [679497, 1032569]
processed_samples 3400 unjoint_samples 3400 joint_samples 9 [679497, 1032569]
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-11-29 01:56:04] iteration       74/     500 | consumed samples:          592 | elapsed time per iteration (ms): 487748.1 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 4.823248E-06 | global batch size:     8 | lm loss: 1.274319E+00 | loss scale: 1.0 | grad norm: 3.009 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
 [2024-11-29 02:04:34] iteration       75/     500 | consumed samples:          600 | elapsed time per iteration (ms): 509804.4 | throughput per GPU (TFLOP/s/GPU): 102.1 | learning rate: 4.817282E-06 | global batch size:     8 | lm loss: 1.160900E+00 | loss scale: 1.0 | grad norm: 1.811 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1033502, 1044301]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046590, 765565]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1033502, 1044301]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046590, 765565]
processed_samples 3500 unjoint_samples 3500 joint_samples 10 [1047244, 10668]
processed_samples 3500 unjoint_samples 3500 joint_samples 10 [1047244, 10668]
processed_samples 3500 unjoint_samples 3500 joint_samples 10 [223735, 1047330]
processed_samples 3500 unjoint_samples 3500 joint_samples 10 [223735, 1047330]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046742, 545020]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046742, 545020]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046506, 752241]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046506, 752241]
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
processed_samples 3500 unjoint_samples 3500 joint_samples 10 [933465, 993746]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1034621, 1034829]
processed_samples 3500 unjoint_samples 3500 joint_samples 10 [933465, 993746]
processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1034621, 1034829]
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
 [2024-11-29 02:14:25] iteration       76/     500 | consumed samples:          608 | elapsed time per iteration (ms): 591255.1 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 4.811221E-06 | global batch size:     8 | lm loss: 1.213377E+00 | loss scale: 1.0 | grad norm: 1.804 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
 [2024-11-29 02:21:37] iteration       77/     500 | consumed samples:          616 | elapsed time per iteration (ms): 431698.0 | throughput per GPU (TFLOP/s/GPU): 120.5 | learning rate: 4.805065E-06 | global batch size:     8 | lm loss: 1.169552E+00 | loss scale: 1.0 | grad norm: 2.096 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
 [2024-11-29 02:29:05] iteration       78/     500 | consumed samples:          624 | elapsed time per iteration (ms): 448587.5 | throughput per GPU (TFLOP/s/GPU): 116.0 | learning rate: 4.798814E-06 | global batch size:     8 | lm loss: 1.225845E+00 | loss scale: 1.0 | grad norm: 1.673 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
 [2024-11-29 02:37:14] iteration       79/     500 | consumed samples:          632 | elapsed time per iteration (ms): 488178.4 | throughput per GPU (TFLOP/s/GPU): 106.6 | learning rate: 4.792469E-06 | global batch size:     8 | lm loss: 1.162172E+00 | loss scale: 1.0 | grad norm: 1.446 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1046590, 15967]
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1046590, 15967]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [299263, 1045384]
processed_samples 3600 unjoint_samples 3600 joint_samples 11 [1037619, 181752]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [299263, 1045384]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1044335, 332165]
processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046506, 1042136]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1047244, 305151]
processed_samples 3600 unjoint_samples 3600 joint_samples 11 [1037619, 181752]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1044335, 332165]
processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046742, 769797]
processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046742, 769797]
processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046506, 1042136]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1047244, 305151]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [565551, 1047330]
processed_samples 3600 unjoint_samples 3600 joint_samples 10 [565551, 1047330]
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
 [2024-11-29 02:46:17] iteration       80/     500 | consumed samples:          640 | elapsed time per iteration (ms): 543840.7 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 4.786030E-06 | global batch size:     8 | lm loss: 1.119773E+00 | loss scale: 1.0 | grad norm: 1.404 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (245412.40, 245413.07)
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [216677, 1044651]
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1046590, 334641]
processed_samples 3700 unjoint_samples 3700 joint_samples 9 [1046742, 1013845]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [608115, 1045384]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1044335, 574437]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1047244, 607285]
processed_samples 3700 unjoint_samples 3700 joint_samples 11 [1037619, 632873]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1011196, 1047330]
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [216677, 1044651]
processed_samples 3700 unjoint_samples 3700 joint_samples 9 [1046742, 1013845]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [608115, 1045384]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1046590, 334641]
processed_samples 3700 unjoint_samples 3700 joint_samples 11 [1037619, 632873]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1044335, 574437]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1047244, 607285]
processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1011196, 1047330]
 [2024-11-29 03:00:11] iteration       81/     500 | consumed samples:          648 | elapsed time per iteration (ms): 587900.4 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 4.779497E-06 | global batch size:     8 | lm loss: 1.176208E+00 | loss scale: 1.0 | grad norm: 1.442 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 03:06:57] iteration       82/     500 | consumed samples:          656 | elapsed time per iteration (ms): 406422.6 | throughput per GPU (TFLOP/s/GPU): 128.0 | learning rate: 4.772870E-06 | global batch size:     8 | lm loss: 1.140343E+00 | loss scale: 1.0 | grad norm: 1.479 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d514af6580] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1047244, 929345]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046790, 390463]
processed_samples 3800 unjoint_samples 3800 joint_samples 11 [242157, 1047330]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046590, 621579]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1044335, 832002]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [606617, 1044651]
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [910276, 1045384]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 11 [1037619, 908765]
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1047244, 929345]
[h264 @ 0x55d4f080e440] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046790, 390463]
processed_samples 3800 unjoint_samples 3800 joint_samples 11 [242157, 1047330]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [606617, 1044651]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046590, 621579]
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1044335, 832002]
[h264 @ 0x55d514409540] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 10 [910276, 1045384]
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 3800 unjoint_samples 3800 joint_samples 11 [1037619, 908765]
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-11-29 03:17:00] iteration       83/     500 | consumed samples:          664 | elapsed time per iteration (ms): 603008.3 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 4.766150E-06 | global batch size:     8 | lm loss: 1.124429E+00 | loss scale: 1.0 | grad norm: 1.974 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-11-29 03:25:11] iteration       84/     500 | consumed samples:          672 | elapsed time per iteration (ms): 490715.5 | throughput per GPU (TFLOP/s/GPU): 106.1 | learning rate: 4.759337E-06 | global batch size:     8 | lm loss: 1.055463E+00 | loss scale: 1.0 | grad norm: 1.717 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-11-29 03:33:25] iteration       85/     500 | consumed samples:          680 | elapsed time per iteration (ms): 494020.4 | throughput per GPU (TFLOP/s/GPU): 105.3 | learning rate: 4.752432E-06 | global batch size:     8 | lm loss: 1.164560E+00 | loss scale: 1.0 | grad norm: 2.110 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 03:40:54] iteration       86/     500 | consumed samples:          688 | elapsed time per iteration (ms): 448621.9 | throughput per GPU (TFLOP/s/GPU): 116.0 | learning rate: 4.745434E-06 | global batch size:     8 | lm loss: 1.063381E+00 | loss scale: 1.0 | grad norm: 1.811 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [244902, 1046490]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [244902, 1046490]
processed_samples 3900 unjoint_samples 3900 joint_samples 10 [805089, 1044651]
processed_samples 3900 unjoint_samples 3900 joint_samples 10 [805089, 1044651]
processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046790, 707809]
processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046790, 707809]
processed_samples 3900 unjoint_samples 3900 joint_samples 12 [1042702, 181960]
processed_samples 3900 unjoint_samples 3900 joint_samples 12 [1042702, 181960]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [1045136, 195057]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [1045136, 195057]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [153527, 1039099]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [153527, 1039099]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [500248, 1047330]
processed_samples 3900 unjoint_samples 3900 joint_samples 11 [500248, 1047330]
processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046590, 967625]
processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046590, 967625]
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
 [2024-11-29 03:48:53] iteration       87/     500 | consumed samples:          696 | elapsed time per iteration (ms): 479164.2 | throughput per GPU (TFLOP/s/GPU): 108.6 | learning rate: 4.738344E-06 | global batch size:     8 | lm loss: 1.037563E+00 | loss scale: 1.0 | grad norm: 1.815 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
 [2024-11-29 03:57:24] iteration       88/     500 | consumed samples:          704 | elapsed time per iteration (ms): 510741.3 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 4.731162E-06 | global batch size:     8 | lm loss: 1.042004E+00 | loss scale: 1.0 | grad norm: 1.369 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1047088, 6440]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1047088, 6440]
 [2024-11-29 04:06:50] iteration       89/     500 | consumed samples:          712 | elapsed time per iteration (ms): 566332.4 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 4.723889E-06 | global batch size:     8 | lm loss: 1.054446E+00 | loss scale: 1.0 | grad norm: 1.186 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [830368, 1047330]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [830368, 1047330]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [46408, 1037817]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [46408, 1037817]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1045136, 461770]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [563123, 1046490]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1046590, 171890]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1045136, 461770]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1046590, 171890]
processed_samples 4000 unjoint_samples 4000 joint_samples 12 [1042702, 496263]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [563123, 1046490]
processed_samples 4000 unjoint_samples 4000 joint_samples 12 [1042702, 496263]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [458049, 1039099]
processed_samples 4000 unjoint_samples 4000 joint_samples 11 [458049, 1039099]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-11-29 04:14:27] iteration       90/     500 | consumed samples:          720 | elapsed time per iteration (ms): 457008.2 | throughput per GPU (TFLOP/s/GPU): 113.9 | learning rate: 4.716524E-06 | global batch size:     8 | lm loss: 1.043577E+00 | loss scale: 1.0 | grad norm: 1.171 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1047088, 285789]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [409121, 1037817]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1046590, 563874]
processed_samples 4100 unjoint_samples 4100 joint_samples 12 [993319, 204450]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [777555, 1039099]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [824691, 1046490]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1045136, 727958]
processed_samples 4100 unjoint_samples 4100 joint_samples 12 [1042702, 905179]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1046590, 563874]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1047088, 285789]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [409121, 1037817]
processed_samples 4100 unjoint_samples 4100 joint_samples 12 [993319, 204450]
processed_samples 4100 unjoint_samples 4100 joint_samples 12 [1042702, 905179]
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [777555, 1039099]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [824691, 1046490]
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1045136, 727958]
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
 [2024-11-29 04:24:04] iteration       91/     500 | consumed samples:          728 | elapsed time per iteration (ms): 576893.4 | throughput per GPU (TFLOP/s/GPU): 90.2 | learning rate: 4.709068E-06 | global batch size:     8 | lm loss: 9.959705E-01 | loss scale: 1.0 | grad norm: 1.052 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 04:31:49] iteration       92/     500 | consumed samples:          736 | elapsed time per iteration (ms): 464881.9 | throughput per GPU (TFLOP/s/GPU): 111.9 | learning rate: 4.701522E-06 | global batch size:     8 | lm loss: 1.006524E+00 | loss scale: 1.0 | grad norm: 1.068 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
 [2024-11-29 04:39:19] iteration       93/     500 | consumed samples:          744 | elapsed time per iteration (ms): 450545.2 | throughput per GPU (TFLOP/s/GPU): 115.5 | learning rate: 4.693886E-06 | global batch size:     8 | lm loss: 9.836991E-01 | loss scale: 1.0 | grad norm: 1.314 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
 [2024-11-29 04:48:00] iteration       94/     500 | consumed samples:          752 | elapsed time per iteration (ms): 520814.7 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 4.686160E-06 | global batch size:     8 | lm loss: 1.038640E+00 | loss scale: 1.0 | grad norm: 1.123 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1047088, 636316]
processed_samples 4200 unjoint_samples 4200 joint_samples 13 [1046693, 107270]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [83138, 1010857]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [981983, 307130]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [37300, 1046490]
processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1046590, 865156]
processed_samples 4200 unjoint_samples 4200 joint_samples 11 [734402, 1037817]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [993319, 475002]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1047088, 636316]
processed_samples 4200 unjoint_samples 4200 joint_samples 13 [1046693, 107270]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [83138, 1010857]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [37300, 1046490]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [981983, 307130]
processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1046590, 865156]
processed_samples 4200 unjoint_samples 4200 joint_samples 11 [734402, 1037817]
processed_samples 4200 unjoint_samples 4200 joint_samples 12 [993319, 475002]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
 [2024-11-29 04:56:25] iteration       95/     500 | consumed samples:          760 | elapsed time per iteration (ms): 504330.7 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 4.678344E-06 | global batch size:     8 | lm loss: 1.033153E+00 | loss scale: 1.0 | grad norm: 1.444 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1047088, 890480]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [981983, 615910]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [102485, 1046252]
processed_samples 4300 unjoint_samples 4300 joint_samples 13 [1046693, 415812]
processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1047088, 890480]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [102485, 1046252]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [406857, 1046490]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [412630, 1010857]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [981983, 615910]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [412630, 1010857]
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [406857, 1046490]
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [993319, 788105]
processed_samples 4300 unjoint_samples 4300 joint_samples 13 [1046693, 415812]
processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1011278, 1037817]
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1011278, 1037817]
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
processed_samples 4300 unjoint_samples 4300 joint_samples 12 [993319, 788105]
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
 [2024-11-29 05:05:55] iteration       96/     500 | consumed samples:          768 | elapsed time per iteration (ms): 570433.4 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 4.670439E-06 | global batch size:     8 | lm loss: 1.019382E+00 | loss scale: 1.0 | grad norm: 1.070 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
 [2024-11-29 05:14:18] iteration       97/     500 | consumed samples:          776 | elapsed time per iteration (ms): 502991.6 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 4.662444E-06 | global batch size:     8 | lm loss: 1.011606E+00 | loss scale: 1.0 | grad norm: 1.330 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51396a480] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-11-29 05:22:22] iteration       98/     500 | consumed samples:          784 | elapsed time per iteration (ms): 484093.2 | throughput per GPU (TFLOP/s/GPU): 107.5 | learning rate: 4.654361E-06 | global batch size:     8 | lm loss: 1.043122E+00 | loss scale: 1.0 | grad norm: 1.471 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1047088, 109545]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1047088, 109545]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [520419, 1046252]
processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1042102, 33067]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [520419, 1046252]
processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1042102, 33067]
processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1046693, 688866]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [768511, 1046490]
processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1046693, 688866]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1041692, 359584]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1041692, 359584]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [981983, 886177]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [743694, 1010857]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [743694, 1010857]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [768511, 1046490]
processed_samples 4400 unjoint_samples 4400 joint_samples 12 [981983, 886177]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51396a480] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
 [2024-11-29 05:31:38] iteration       99/     500 | consumed samples:          792 | elapsed time per iteration (ms): 556395.2 | throughput per GPU (TFLOP/s/GPU): 93.5 | learning rate: 4.646190E-06 | global batch size:     8 | lm loss: 9.925530E-01 | loss scale: 1.0 | grad norm: 1.482 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
 [2024-11-29 05:39:47] iteration      100/     500 | consumed samples:          800 | elapsed time per iteration (ms): 488373.0 | throughput per GPU (TFLOP/s/GPU): 106.6 | learning rate: 4.637931E-06 | global batch size:     8 | lm loss: 9.455621E-01 | loss scale: 1.0 | grad norm: 0.972 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (248965.82, 248966.16)
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1036673, 41529]
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1036673, 41529]
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1047088, 454637]
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1047088, 454637]
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [10701, 1046490]
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [10701, 1046490]
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1042102, 270411]
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1042102, 270411]
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [834087, 1046252]
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [834087, 1046252]
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1041692, 612687]
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1041692, 612687]
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1046693, 1013490]
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1038438, 1038449]
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1038438, 1038449]
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1046693, 1013490]
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
 [2024-11-29 05:51:18] iteration      101/     500 | consumed samples:          808 | elapsed time per iteration (ms): 442254.7 | throughput per GPU (TFLOP/s/GPU): 117.7 | learning rate: 4.629585E-06 | global batch size:     8 | lm loss: 1.065639E+00 | loss scale: 1.0 | grad norm: 1.389 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51396a480] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-11-29 05:59:08] iteration      102/     500 | consumed samples:          816 | elapsed time per iteration (ms): 470143.8 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 4.621151E-06 | global batch size:     8 | lm loss: 1.034651E+00 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
 [2024-11-29 06:06:19] iteration      103/     500 | consumed samples:          824 | elapsed time per iteration (ms): 431079.3 | throughput per GPU (TFLOP/s/GPU): 120.7 | learning rate: 4.612630E-06 | global batch size:     8 | lm loss: 1.042905E+00 | loss scale: 1.0 | grad norm: 1.657 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1036673, 353785]
[h264 @ 0x562163b30a80] mmco: unref short failure
processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1047088, 687956]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042766, 104271]
processed_samples 4600 unjoint_samples 4600 joint_samples 14 [311681, 1031759]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042102, 534994]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [360930, 1045107]
processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1041692, 896915]
[h264 @ 0x56215babf100] mmco: unref short failure
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [257614, 1046490]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1036673, 353785]
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1047088, 687956]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042766, 104271]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [257614, 1046490]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [360930, 1045107]
processed_samples 4600 unjoint_samples 4600 joint_samples 14 [311681, 1031759]
processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042102, 534994]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1041692, 896915]
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5135e2680] mmco: unref short failure
[h264 @ 0x55d5135e2680] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
 [2024-11-29 06:15:23] iteration      104/     500 | consumed samples:          832 | elapsed time per iteration (ms): 543203.7 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 4.604022E-06 | global batch size:     8 | lm loss: 1.031887E+00 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [62227, 1003115]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [62227, 1003115]
 [2024-11-29 06:25:18] iteration      105/     500 | consumed samples:          840 | elapsed time per iteration (ms): 595132.4 | throughput per GPU (TFLOP/s/GPU): 87.4 | learning rate: 4.595329E-06 | global batch size:     8 | lm loss: 1.009381E+00 | loss scale: 1.0 | grad norm: 1.249 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [157102, 1036574]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [157102, 1036574]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042766, 381816]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [515158, 1046490]
processed_samples 4700 unjoint_samples 4700 joint_samples 14 [612607, 1031759]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042102, 837593]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042102, 837593]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042766, 381816]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1036673, 732799]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [576384, 1045107]
processed_samples 4700 unjoint_samples 4700 joint_samples 14 [612607, 1031759]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1036673, 732799]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [576384, 1045107]
processed_samples 4700 unjoint_samples 4700 joint_samples 13 [515158, 1046490]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
 [2024-11-29 06:34:43] iteration      106/     500 | consumed samples:          848 | elapsed time per iteration (ms): 565474.5 | throughput per GPU (TFLOP/s/GPU): 92.0 | learning rate: 4.586549E-06 | global batch size:     8 | lm loss: 1.035590E+00 | loss scale: 1.0 | grad norm: 1.114 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [842164, 1046490]
processed_samples 4800 unjoint_samples 4800 joint_samples 14 [126876, 1033289]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [374110, 1036574]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1036673, 997318]
processed_samples 4800 unjoint_samples 4800 joint_samples 14 [947205, 1031759]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [409213, 1003115]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1042766, 729396]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [915955, 1045107]
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [409213, 1003115]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [374110, 1036574]
processed_samples 4800 unjoint_samples 4800 joint_samples 14 [126876, 1033289]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1042766, 729396]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1036673, 997318]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [842164, 1046490]
processed_samples 4800 unjoint_samples 4800 joint_samples 14 [947205, 1031759]
processed_samples 4800 unjoint_samples 4800 joint_samples 13 [915955, 1045107]
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
 [2024-11-29 06:42:00] iteration      107/     500 | consumed samples:          856 | elapsed time per iteration (ms): 436982.9 | throughput per GPU (TFLOP/s/GPU): 119.1 | learning rate: 4.577684E-06 | global batch size:     8 | lm loss: 9.436591E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
 [2024-11-29 06:51:01] iteration      108/     500 | consumed samples:          864 | elapsed time per iteration (ms): 540830.1 | throughput per GPU (TFLOP/s/GPU): 96.2 | learning rate: 4.568735E-06 | global batch size:     8 | lm loss: 9.739519E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
 [2024-11-29 06:58:44] iteration      109/     500 | consumed samples:          872 | elapsed time per iteration (ms): 462580.7 | throughput per GPU (TFLOP/s/GPU): 112.5 | learning rate: 4.559700E-06 | global batch size:     8 | lm loss: 1.027283E+00 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-29 07:06:40] iteration      110/     500 | consumed samples:          880 | elapsed time per iteration (ms): 476241.8 | throughput per GPU (TFLOP/s/GPU): 109.3 | learning rate: 4.550581E-06 | global batch size:     8 | lm loss: 9.752579E-01 | loss scale: 1.0 | grad norm: 1.212 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 4900 unjoint_samples 4900 joint_samples 13 [682640, 1003115]
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 4900 unjoint_samples 4900 joint_samples 13 [613951, 1036574]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [59990, 1047184]
processed_samples 4900 unjoint_samples 4900 joint_samples 15 [187164, 1039417]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [387453, 1033289]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [1036673, 308324]
[h264 @ 0x56215bb86100] mmco: unref short failure
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [258805, 1047319]
processed_samples 4900 unjoint_samples 4900 joint_samples 13 [1042766, 1022708]
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
processed_samples 4900 unjoint_samples 4900 joint_samples 13 [682640, 1003115]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [59990, 1047184]
processed_samples 4900 unjoint_samples 4900 joint_samples 15 [187164, 1039417]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [1036673, 308324]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [387453, 1033289]
processed_samples 4900 unjoint_samples 4900 joint_samples 13 [613951, 1036574]
processed_samples 4900 unjoint_samples 4900 joint_samples 13 [1042766, 1022708]
processed_samples 4900 unjoint_samples 4900 joint_samples 14 [258805, 1047319]
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
 [2024-11-29 07:13:43] iteration      111/     500 | consumed samples:          888 | elapsed time per iteration (ms): 423204.8 | throughput per GPU (TFLOP/s/GPU): 123.0 | learning rate: 4.541378E-06 | global batch size:     8 | lm loss: 9.781538E-01 | loss scale: 1.0 | grad norm: 1.067 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [242265, 1045059]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [430574, 1047184]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [1036673, 661373]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [567324, 1047319]
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 5000 unjoint_samples 5000 joint_samples 15 [526163, 1039417]
processed_samples 5000 unjoint_samples 5000 joint_samples 13 [993636, 1003115]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [639696, 1033289]
processed_samples 5000 unjoint_samples 5000 joint_samples 13 [980516, 1036574]
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [242265, 1045059]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [430574, 1047184]
processed_samples 5000 unjoint_samples 5000 joint_samples 15 [526163, 1039417]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [1036673, 661373]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [639696, 1033289]
processed_samples 5000 unjoint_samples 5000 joint_samples 14 [567324, 1047319]
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 5000 unjoint_samples 5000 joint_samples 13 [993636, 1003115]
processed_samples 5000 unjoint_samples 5000 joint_samples 13 [980516, 1036574]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
 [2024-11-29 07:23:04] iteration      112/     500 | consumed samples:          896 | elapsed time per iteration (ms): 560984.6 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 4.532092E-06 | global batch size:     8 | lm loss: 9.570374E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
 [2024-11-29 07:32:45] iteration      113/     500 | consumed samples:          904 | elapsed time per iteration (ms): 581159.7 | throughput per GPU (TFLOP/s/GPU): 89.5 | learning rate: 4.522722E-06 | global batch size:     8 | lm loss: 9.289886E-01 | loss scale: 1.0 | grad norm: 0.897 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215cb38d40] mmco: unref short failure
[h264 @ 0x56215cb38d40] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [141801, 1046799]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1007557, 381378]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [626624, 1045059]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [799334, 1047184]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [931628, 1033289]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [141801, 1046799]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1007557, 381378]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [626624, 1045059]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [799334, 1047184]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [931628, 1033289]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [898833, 1047319]
[h264 @ 0x55d513a1f200] mmco: unref short failure
processed_samples 5100 unjoint_samples 5100 joint_samples 15 [925759, 1039417]
processed_samples 5100 unjoint_samples 5100 joint_samples 15 [925759, 1039417]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [898833, 1047319]
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1036673, 1012305]
[h264 @ 0x562160944a00] mmco: unref short failure
processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1036673, 1012305]
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
 [2024-11-29 07:41:28] iteration      114/     500 | consumed samples:          912 | elapsed time per iteration (ms): 522621.9 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 4.513270E-06 | global batch size:     8 | lm loss: 9.133291E-01 | loss scale: 1.0 | grad norm: 0.891 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
 [2024-11-29 07:49:33] iteration      115/     500 | consumed samples:          920 | elapsed time per iteration (ms): 485532.1 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 4.503735E-06 | global batch size:     8 | lm loss: 9.276041E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-11-29 07:59:32] iteration      116/     500 | consumed samples:          928 | elapsed time per iteration (ms): 598337.5 | throughput per GPU (TFLOP/s/GPU): 87.0 | learning rate: 4.494118E-06 | global batch size:     8 | lm loss: 9.630095E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
 [2024-11-29 08:08:05] iteration      117/     500 | consumed samples:          936 | elapsed time per iteration (ms): 513474.3 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 4.484420E-06 | global batch size:     8 | lm loss: 9.400152E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1045834, 131862]
processed_samples 5200 unjoint_samples 5200 joint_samples 16 [156295, 1046736]
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [628817, 1046799]
processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1023091, 282688]
processed_samples 5200 unjoint_samples 5200 joint_samples 15 [350115, 1046837]
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1007557, 620477]
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1029032, 1047184]
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [984714, 1045059]
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [628817, 1046799]
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1007557, 620477]
processed_samples 5200 unjoint_samples 5200 joint_samples 16 [156295, 1046736]
processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1045834, 131862]
processed_samples 5200 unjoint_samples 5200 joint_samples 15 [350115, 1046837]
processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1023091, 282688]
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1029032, 1047184]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 5200 unjoint_samples 5200 joint_samples 14 [984714, 1045059]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
 [2024-11-29 08:16:09] iteration      118/     500 | consumed samples:          944 | elapsed time per iteration (ms): 483708.4 | throughput per GPU (TFLOP/s/GPU): 107.6 | learning rate: 4.474640E-06 | global batch size:     8 | lm loss: 9.409871E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
 [2024-11-29 08:24:50] iteration      119/     500 | consumed samples:          952 | elapsed time per iteration (ms): 521429.1 | throughput per GPU (TFLOP/s/GPU): 99.8 | learning rate: 4.464780E-06 | global batch size:     8 | lm loss: 9.223815E-01 | loss scale: 1.0 | grad norm: 6.079 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1038128, 261470]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [377930, 1047184]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [699267, 1046837]
processed_samples 5300 unjoint_samples 5300 joint_samples 16 [472923, 1046736]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1023091, 546262]
processed_samples 5300 unjoint_samples 5300 joint_samples 14 [1007557, 972932]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1045834, 474081]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1038128, 261470]
processed_samples 5300 unjoint_samples 5300 joint_samples 16 [472923, 1046736]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [377930, 1047184]
processed_samples 5300 unjoint_samples 5300 joint_samples 14 [928214, 1046799]
processed_samples 5300 unjoint_samples 5300 joint_samples 14 [928214, 1046799]
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [699267, 1046837]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1045834, 474081]
processed_samples 5300 unjoint_samples 5300 joint_samples 14 [1007557, 972932]
processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1023091, 546262]
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
 [2024-11-29 08:34:56] iteration      120/     500 | consumed samples:          960 | elapsed time per iteration (ms): 605678.0 | throughput per GPU (TFLOP/s/GPU): 85.9 | learning rate: 4.454840E-06 | global batch size:     8 | lm loss: 9.326912E-01 | loss scale: 1.0 | grad norm: 1.200 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (259183.47, 259183.93)
 [2024-11-29 08:47:16] iteration      121/     500 | consumed samples:          968 | elapsed time per iteration (ms): 480180.6 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 4.444819E-06 | global batch size:     8 | lm loss: 9.741026E-01 | loss scale: 1.0 | grad norm: 0.813 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1046338, 293973]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [119916, 1047023]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1038128, 508328]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1046338, 293973]
processed_samples 5400 unjoint_samples 5400 joint_samples 16 [726574, 1046736]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [119916, 1047023]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1038128, 508328]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [703098, 1047184]
processed_samples 5400 unjoint_samples 5400 joint_samples 16 [726574, 1046736]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1023091, 800941]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1023091, 800941]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [703098, 1047184]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1022332, 1046837]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1022332, 1046837]
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1045834, 755518]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1045834, 755518]
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-29 08:55:41] iteration      122/     500 | consumed samples:          976 | elapsed time per iteration (ms): 505286.7 | throughput per GPU (TFLOP/s/GPU): 103.0 | learning rate: 4.434719E-06 | global batch size:     8 | lm loss: 9.632880E-01 | loss scale: 1.0 | grad norm: 0.791 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
 [2024-11-29 09:04:05] iteration      123/     500 | consumed samples:          984 | elapsed time per iteration (ms): 504057.0 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 4.424540E-06 | global batch size:     8 | lm loss: 9.055476E-01 | loss scale: 1.0 | grad norm: 1.148 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [349602, 1047023]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1046338, 531746]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1038128, 841101]
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 5500 unjoint_samples 5500 joint_samples 16 [31641, 1029588]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1046338, 531746]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [349602, 1047023]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1038128, 841101]
processed_samples 5500 unjoint_samples 5500 joint_samples 16 [31641, 1029588]
 [2024-11-29 09:14:16] iteration      124/     500 | consumed samples:          992 | elapsed time per iteration (ms): 611534.8 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.414282E-06 | global batch size:     8 | lm loss: 9.379023E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 5500 unjoint_samples 5500 joint_samples 16 [366600, 1046889]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [952606, 1047184]
processed_samples 5500 unjoint_samples 5500 joint_samples 16 [366600, 1046889]
processed_samples 5500 unjoint_samples 5500 joint_samples 15 [952606, 1047184]
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 5500 unjoint_samples 5500 joint_samples 17 [977175, 102313]
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 5500 unjoint_samples 5500 joint_samples 17 [977175, 102313]
processed_samples 5500 unjoint_samples 5500 joint_samples 16 [1045883, 77171]
processed_samples 5500 unjoint_samples 5500 joint_samples 16 [1045883, 77171]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-29 09:21:13] iteration      125/     500 | consumed samples:         1000 | elapsed time per iteration (ms): 416906.3 | throughput per GPU (TFLOP/s/GPU): 124.8 | learning rate: 4.403946E-06 | global batch size:     8 | lm loss: 9.016672E-01 | loss scale: 1.0 | grad norm: 0.697 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
 [2024-11-29 09:29:09] iteration      126/     500 | consumed samples:         1008 | elapsed time per iteration (ms): 475206.1 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 4.393533E-06 | global batch size:     8 | lm loss: 9.279566E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-11-29 09:37:39] iteration      127/     500 | consumed samples:         1016 | elapsed time per iteration (ms): 510425.2 | throughput per GPU (TFLOP/s/GPU): 102.0 | learning rate: 4.383042E-06 | global batch size:     8 | lm loss: 9.100517E-01 | loss scale: 1.0 | grad norm: 0.751 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 5600 unjoint_samples 5600 joint_samples 15 [665989, 1047023]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [757814, 1046889]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1043995, 106898]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [313677, 1029588]
processed_samples 5600 unjoint_samples 5600 joint_samples 17 [977175, 340283]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [138025, 1047184]
processed_samples 5600 unjoint_samples 5600 joint_samples 15 [665989, 1047023]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1045883, 398188]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [757814, 1046889]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1043995, 106898]
processed_samples 5600 unjoint_samples 5600 joint_samples 15 [1046338, 898228]
processed_samples 5600 unjoint_samples 5600 joint_samples 17 [977175, 340283]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [138025, 1047184]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [313677, 1029588]
processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1045883, 398188]
processed_samples 5600 unjoint_samples 5600 joint_samples 15 [1046338, 898228]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
 [2024-11-29 09:47:04] iteration      128/     500 | consumed samples:         1024 | elapsed time per iteration (ms): 564469.2 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 4.372474E-06 | global batch size:     8 | lm loss: 9.047135E-01 | loss scale: 1.0 | grad norm: 0.990 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [438013, 1047184]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1046338, 168892]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1043995, 440108]
processed_samples 5700 unjoint_samples 5700 joint_samples 17 [977175, 559449]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1048118, 1046889]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [610348, 1029588]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1045883, 910327]
processed_samples 5700 unjoint_samples 5700 joint_samples 15 [991968, 1047023]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [438013, 1047184]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1046338, 168892]
processed_samples 5700 unjoint_samples 5700 joint_samples 17 [977175, 559449]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1043995, 440108]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [610348, 1029588]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1048118, 1046889]
processed_samples 5700 unjoint_samples 5700 joint_samples 15 [991968, 1047023]
processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1045883, 910327]
 [2024-11-29 09:56:27] iteration      129/     500 | consumed samples:         1032 | elapsed time per iteration (ms): 562972.3 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 4.361829E-06 | global batch size:     8 | lm loss: 8.702211E-01 | loss scale: 1.0 | grad norm: 0.896 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
 [2024-11-29 10:07:13] iteration      130/     500 | consumed samples:         1040 | elapsed time per iteration (ms): 646197.5 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 4.351109E-06 | global batch size:     8 | lm loss: 9.481044E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
 [2024-11-29 10:17:23] iteration      131/     500 | consumed samples:         1048 | elapsed time per iteration (ms): 609849.6 | throughput per GPU (TFLOP/s/GPU): 85.3 | learning rate: 4.340313E-06 | global batch size:     8 | lm loss: 9.409455E-01 | loss scale: 1.0 | grad norm: 0.983 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1043995, 761563]
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1046338, 489222]
processed_samples 5800 unjoint_samples 5800 joint_samples 17 [172071, 1039900]
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [208456, 1047023]
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [704681, 1047184]
processed_samples 5800 unjoint_samples 5800 joint_samples 17 [274278, 1047055]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 5800 unjoint_samples 5800 joint_samples 17 [977175, 825885]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [882082, 1029588]
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [208456, 1047023]
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1043995, 761563]
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1046338, 489222]
processed_samples 5800 unjoint_samples 5800 joint_samples 17 [172071, 1039900]
processed_samples 5800 unjoint_samples 5800 joint_samples 17 [274278, 1047055]
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [704681, 1047184]
processed_samples 5800 unjoint_samples 5800 joint_samples 17 [977175, 825885]
processed_samples 5800 unjoint_samples 5800 joint_samples 16 [882082, 1029588]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
 [2024-11-29 10:27:20] iteration      132/     500 | consumed samples:         1056 | elapsed time per iteration (ms): 597434.6 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.329441E-06 | global batch size:     8 | lm loss: 9.020258E-01 | loss scale: 1.0 | grad norm: 0.744 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
 [2024-11-29 10:35:23] iteration      133/     500 | consumed samples:         1064 | elapsed time per iteration (ms): 482648.4 | throughput per GPU (TFLOP/s/GPU): 107.8 | learning rate: 4.318496E-06 | global batch size:     8 | lm loss: 8.788584E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
 [2024-11-29 10:43:43] iteration      134/     500 | consumed samples:         1072 | elapsed time per iteration (ms): 499918.4 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 4.307476E-06 | global batch size:     8 | lm loss: 9.287664E-01 | loss scale: 1.0 | grad norm: 0.724 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
 [2024-11-29 10:51:57] iteration      135/     500 | consumed samples:         1080 | elapsed time per iteration (ms): 493979.2 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.296382E-06 | global batch size:     8 | lm loss: 9.228923E-01 | loss scale: 1.0 | grad norm: 0.708 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [132904, 1028391]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [1046167, 219972]
processed_samples 5900 unjoint_samples 5900 joint_samples 16 [503140, 1047023]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [92033, 1047184]
processed_samples 5900 unjoint_samples 5900 joint_samples 16 [1046338, 844199]
processed_samples 5900 unjoint_samples 5900 joint_samples 18 [1004769, 103157]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [427988, 1039900]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [541781, 1047055]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [132904, 1028391]
processed_samples 5900 unjoint_samples 5900 joint_samples 18 [1004769, 103157]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [92033, 1047184]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [1046167, 219972]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [427988, 1039900]
processed_samples 5900 unjoint_samples 5900 joint_samples 16 [503140, 1047023]
processed_samples 5900 unjoint_samples 5900 joint_samples 16 [1046338, 844199]
processed_samples 5900 unjoint_samples 5900 joint_samples 17 [541781, 1047055]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
 [2024-11-29 11:00:06] iteration      136/     500 | consumed samples:         1088 | elapsed time per iteration (ms): 489608.1 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 4.285215E-06 | global batch size:     8 | lm loss: 9.310520E-01 | loss scale: 1.0 | grad norm: 0.922 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [369661, 1028391]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046338, 178925]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046167, 558366]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [364407, 1047184]
[h264 @ 0x562161968b80] mmco: unref short failure
processed_samples 6000 unjoint_samples 6000 joint_samples 18 [1004769, 426598]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [783853, 1039900]
processed_samples 6000 unjoint_samples 6000 joint_samples 16 [793450, 1047023]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [804873, 1047055]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [369661, 1028391]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046338, 178925]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [364407, 1047184]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046167, 558366]
processed_samples 6000 unjoint_samples 6000 joint_samples 16 [793450, 1047023]
processed_samples 6000 unjoint_samples 6000 joint_samples 18 [1004769, 426598]
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [783853, 1039900]
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 6000 unjoint_samples 6000 joint_samples 17 [804873, 1047055]
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-11-29 11:10:36] iteration      137/     500 | consumed samples:         1096 | elapsed time per iteration (ms): 629550.2 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 4.273975E-06 | global batch size:     8 | lm loss: 8.931444E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
 [2024-11-29 11:21:13] iteration      138/     500 | consumed samples:         1104 | elapsed time per iteration (ms): 637652.4 | throughput per GPU (TFLOP/s/GPU): 81.6 | learning rate: 4.262663E-06 | global batch size:     8 | lm loss: 8.654176E-01 | loss scale: 1.0 | grad norm: 0.660 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 6100 unjoint_samples 6100 joint_samples 18 [131317, 1042149]
processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1047271, 102335]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046338, 527140]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [679324, 1047184]
processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1004769, 794060]
processed_samples 6100 unjoint_samples 6100 joint_samples 16 [1043235, 1047023]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [628454, 1028391]
processed_samples 6100 unjoint_samples 6100 joint_samples 18 [131317, 1042149]
processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1047271, 102335]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [628454, 1028391]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [679324, 1047184]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046167, 832626]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046338, 527140]
processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1004769, 794060]
processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046167, 832626]
processed_samples 6100 unjoint_samples 6100 joint_samples 16 [1043235, 1047023]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
 [2024-11-29 11:31:38] iteration      139/     500 | consumed samples:         1112 | elapsed time per iteration (ms): 624648.0 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 4.251279E-06 | global batch size:     8 | lm loss: 9.249836E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
 [2024-11-29 11:41:36] iteration      140/     500 | consumed samples:         1120 | elapsed time per iteration (ms): 597821.7 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.239823E-06 | global batch size:     8 | lm loss: 8.981265E-01 | loss scale: 1.0 | grad norm: 0.776 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (257016.91, 257017.27)
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
 [2024-11-29 11:52:19] iteration      141/     500 | consumed samples:         1128 | elapsed time per iteration (ms): 386051.8 | throughput per GPU (TFLOP/s/GPU): 134.8 | learning rate: 4.228297E-06 | global batch size:     8 | lm loss: 8.393649E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [320922, 1047023]
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [1046338, 759933]
processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1046167, 138037]
processed_samples 6200 unjoint_samples 6200 joint_samples 19 [1030944, 169552]
processed_samples 6200 unjoint_samples 6200 joint_samples 18 [443597, 1042149]
processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1047271, 408245]
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [883233, 1028391]
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [929179, 1047184]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [320922, 1047023]
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [883233, 1028391]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1046167, 138037]
processed_samples 6200 unjoint_samples 6200 joint_samples 19 [1030944, 169552]
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [1046338, 759933]
processed_samples 6200 unjoint_samples 6200 joint_samples 18 [443597, 1042149]
processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1047271, 408245]
[h264 @ 0x562160944a00] mmco: unref short failure
processed_samples 6200 unjoint_samples 6200 joint_samples 17 [929179, 1047184]
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
 [2024-11-29 12:00:47] iteration      142/     500 | consumed samples:         1136 | elapsed time per iteration (ms): 507722.0 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 4.216700E-06 | global batch size:     8 | lm loss: 8.758271E-01 | loss scale: 1.0 | grad norm: 0.692 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
 [2024-11-29 12:09:29] iteration      143/     500 | consumed samples:         1144 | elapsed time per iteration (ms): 522359.7 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 4.205033E-06 | global batch size:     8 | lm loss: 9.256617E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
 [2024-11-29 12:16:51] iteration      144/     500 | consumed samples:         1152 | elapsed time per iteration (ms): 441969.7 | throughput per GPU (TFLOP/s/GPU): 117.7 | learning rate: 4.193297E-06 | global batch size:     8 | lm loss: 8.913149E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [53144, 1038729]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046650, 282275]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046167, 459292]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [770343, 1042149]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1042680, 77682]
processed_samples 6300 unjoint_samples 6300 joint_samples 19 [1030944, 426577]
processed_samples 6300 unjoint_samples 6300 joint_samples 17 [704195, 1047023]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1047271, 714129]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [53144, 1038729]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [770343, 1042149]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046167, 459292]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1042680, 77682]
processed_samples 6300 unjoint_samples 6300 joint_samples 19 [1030944, 426577]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046650, 282275]
processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1047271, 714129]
[h264 @ 0x56215b453c80] mmco: unref short failure
processed_samples 6300 unjoint_samples 6300 joint_samples 17 [704195, 1047023]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215b5c1b00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1042680, 296069]
[h264 @ 0x56215bb50880] mmco: unref short failure
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [400344, 1038729]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046167, 731020]
processed_samples 6400 unjoint_samples 6400 joint_samples 19 [1030944, 707620]
processed_samples 6400 unjoint_samples 6400 joint_samples 17 [1044682, 1047023]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1041317, 1042149]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1047271, 1022858]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046650, 600778]
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1042680, 296069]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [400344, 1038729]
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1041317, 1042149]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046167, 731020]
processed_samples 6400 unjoint_samples 6400 joint_samples 19 [1030944, 707620]
processed_samples 6400 unjoint_samples 6400 joint_samples 17 [1044682, 1047023]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046650, 600778]
processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1047271, 1022858]
 [2024-11-29 12:28:10] iteration      145/     500 | consumed samples:         1160 | elapsed time per iteration (ms): 678831.2 | throughput per GPU (TFLOP/s/GPU): 76.7 | learning rate: 4.181492E-06 | global batch size:     8 | lm loss: 8.918962E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-11-29 12:39:25] iteration      146/     500 | consumed samples:         1168 | elapsed time per iteration (ms): 675348.5 | throughput per GPU (TFLOP/s/GPU): 77.1 | learning rate: 4.169619E-06 | global batch size:     8 | lm loss: 8.583605E-01 | loss scale: 1.0 | grad norm: 0.592 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
 [2024-11-29 12:49:16] iteration      147/     500 | consumed samples:         1176 | elapsed time per iteration (ms): 590427.3 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.157677E-06 | global batch size:     8 | lm loss: 8.400425E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [845034, 1038729]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [280065, 1047023]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1042680, 632969]
processed_samples 6500 unjoint_samples 6500 joint_samples 19 [343590, 1048229]
processed_samples 6500 unjoint_samples 6500 joint_samples 19 [252091, 1037762]
processed_samples 6500 unjoint_samples 6500 joint_samples 19 [1030944, 1019548]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046650, 946021]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046167, 998844]
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [280065, 1047023]
processed_samples 6500 unjoint_samples 6500 joint_samples 19 [252091, 1037762]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1042680, 632969]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [845034, 1038729]
processed_samples 6500 unjoint_samples 6500 joint_samples 19 [343590, 1048229]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046650, 946021]
processed_samples 6500 unjoint_samples 6500 joint_samples 19 [1030944, 1019548]
processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046167, 998844]
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
 [2024-11-29 12:59:13] iteration      148/     500 | consumed samples:         1184 | elapsed time per iteration (ms): 597451.7 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.145668E-06 | global batch size:     8 | lm loss: 8.973814E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 13:07:27] iteration      149/     500 | consumed samples:         1192 | elapsed time per iteration (ms): 493708.6 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.133592E-06 | global batch size:     8 | lm loss: 9.514019E-01 | loss scale: 1.0 | grad norm: 0.727 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
 [2024-11-29 13:16:29] iteration      150/     500 | consumed samples:         1200 | elapsed time per iteration (ms): 541637.5 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 4.121450E-06 | global batch size:     8 | lm loss: 9.203215E-01 | loss scale: 1.0 | grad norm: 0.633 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-29 13:23:20] iteration      151/     500 | consumed samples:         1208 | elapsed time per iteration (ms): 411185.1 | throughput per GPU (TFLOP/s/GPU): 126.6 | learning rate: 4.109242E-06 | global batch size:     8 | lm loss: 8.703743E-01 | loss scale: 1.0 | grad norm: 0.635 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-11-29 13:31:53] iteration      152/     500 | consumed samples:         1216 | elapsed time per iteration (ms): 513393.7 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 4.096968E-06 | global batch size:     8 | lm loss: 9.042392E-01 | loss scale: 1.0 | grad norm: 0.720 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1027384, 113895]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1027384, 113895]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [37886, 1044635]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [37886, 1044635]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1046650, 145477]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1046650, 145477]
processed_samples 6600 unjoint_samples 6600 joint_samples 18 [668607, 1047023]
processed_samples 6600 unjoint_samples 6600 joint_samples 18 [668607, 1047023]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [321006, 1030705]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [321006, 1030705]
processed_samples 6600 unjoint_samples 6600 joint_samples 20 [1031402, 448483]
processed_samples 6600 unjoint_samples 6600 joint_samples 20 [1031402, 448483]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [549560, 1037762]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [549560, 1037762]
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [673000, 1048229]
processed_samples 6600 unjoint_samples 6600 joint_samples 19 [673000, 1048229]
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
 [2024-11-29 13:41:29] iteration      153/     500 | consumed samples:         1224 | elapsed time per iteration (ms): 575874.7 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.084630E-06 | global batch size:     8 | lm loss: 9.082218E-01 | loss scale: 1.0 | grad norm: 0.646 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [337863, 1044635]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1027384, 466710]
processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1046189, 10876]
processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1046189, 10876]
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 6700 unjoint_samples 6700 joint_samples 18 [970558, 1047023]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [337863, 1044635]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1046650, 542888]
processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1031402, 787895]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1027384, 466710]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [568568, 1030705]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1046650, 542888]
processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1031402, 787895]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [568568, 1030705]
processed_samples 6700 unjoint_samples 6700 joint_samples 18 [970558, 1047023]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [826032, 1037762]
processed_samples 6700 unjoint_samples 6700 joint_samples 19 [826032, 1037762]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
 [2024-11-29 13:51:03] iteration      154/     500 | consumed samples:         1232 | elapsed time per iteration (ms): 573469.6 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 4.072227E-06 | global batch size:     8 | lm loss: 8.581865E-01 | loss scale: 1.0 | grad norm: 0.790 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
 [2024-11-29 14:00:09] iteration      155/     500 | consumed samples:         1240 | elapsed time per iteration (ms): 546612.5 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 4.059760E-06 | global batch size:     8 | lm loss: 9.444237E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
 [2024-11-29 14:09:55] iteration      156/     500 | consumed samples:         1248 | elapsed time per iteration (ms): 585879.6 | throughput per GPU (TFLOP/s/GPU): 88.8 | learning rate: 4.047230E-06 | global batch size:     8 | lm loss: 9.171274E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [271761, 1047023]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [271761, 1047023]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1027384, 781072]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1027384, 781072]
processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1048170, 25513]
processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1046189, 348499]
processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1048170, 25513]
processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1046189, 348499]
processed_samples 6800 unjoint_samples 6800 joint_samples 21 [109090, 1016327]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [934686, 1030705]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1046650, 876889]
processed_samples 6800 unjoint_samples 6800 joint_samples 21 [109090, 1016327]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1046650, 876889]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [647843, 1044635]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [647843, 1044635]
processed_samples 6800 unjoint_samples 6800 joint_samples 19 [934686, 1030705]
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-11-29 14:17:43] iteration      157/     500 | consumed samples:         1256 | elapsed time per iteration (ms): 467819.2 | throughput per GPU (TFLOP/s/GPU): 111.2 | learning rate: 4.034637E-06 | global batch size:     8 | lm loss: 8.549634E-01 | loss scale: 1.0 | grad norm: 0.703 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
 [2024-11-29 14:26:39] iteration      158/     500 | consumed samples:         1264 | elapsed time per iteration (ms): 535964.1 | throughput per GPU (TFLOP/s/GPU): 97.1 | learning rate: 4.021981E-06 | global batch size:     8 | lm loss: 8.785049E-01 | loss scale: 1.0 | grad norm: 1.639 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
 [2024-11-29 14:33:25] iteration      159/     500 | consumed samples:         1272 | elapsed time per iteration (ms): 406479.1 | throughput per GPU (TFLOP/s/GPU): 128.0 | learning rate: 4.009264E-06 | global batch size:     8 | lm loss: 8.546377E-01 | loss scale: 1.0 | grad norm: 0.659 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [12617, 1047814]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [12617, 1047814]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1022264, 234298]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1022264, 234298]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046650, 126573]
processed_samples 6900 unjoint_samples 6900 joint_samples 19 [627764, 1047023]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046650, 126573]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1048170, 520650]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1048170, 520650]
processed_samples 6900 unjoint_samples 6900 joint_samples 19 [627764, 1047023]
processed_samples 6900 unjoint_samples 6900 joint_samples 21 [410801, 1016327]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046189, 859306]
processed_samples 6900 unjoint_samples 6900 joint_samples 19 [1042702, 1044635]
processed_samples 6900 unjoint_samples 6900 joint_samples 19 [1042702, 1044635]
processed_samples 6900 unjoint_samples 6900 joint_samples 21 [410801, 1016327]
processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046189, 859306]
 [2024-11-29 14:40:53] iteration      160/     500 | consumed samples:         1280 | elapsed time per iteration (ms): 447637.9 | throughput per GPU (TFLOP/s/GPU): 116.3 | learning rate: 3.996486E-06 | global batch size:     8 | lm loss: 8.583610E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (126718.18, 126718.44)
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
 [2024-11-29 14:51:07] iteration      161/     500 | consumed samples:         1288 | elapsed time per iteration (ms): 487365.2 | throughput per GPU (TFLOP/s/GPU): 106.8 | learning rate: 3.983647E-06 | global batch size:     8 | lm loss: 8.265705E-01 | loss scale: 1.0 | grad norm: 1.024 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1022264, 507371]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [226430, 1047106]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [289600, 1047814]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1046650, 487061]
[h264 @ 0x56215c37f280] mmco: unref short failure
processed_samples 7000 unjoint_samples 7000 joint_samples 21 [93852, 1042906]
processed_samples 7000 unjoint_samples 7000 joint_samples 19 [898532, 1047023]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1048170, 811652]
[h264 @ 0x56215ba1fac0] mmco: unref short failure
processed_samples 7000 unjoint_samples 7000 joint_samples 21 [733359, 1016327]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1022264, 507371]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1046650, 487061]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [226430, 1047106]
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [289600, 1047814]
processed_samples 7000 unjoint_samples 7000 joint_samples 19 [898532, 1047023]
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1048170, 811652]
processed_samples 7000 unjoint_samples 7000 joint_samples 21 [93852, 1042906]
processed_samples 7000 unjoint_samples 7000 joint_samples 21 [733359, 1016327]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
 [2024-11-29 15:04:43] iteration      162/     500 | consumed samples:         1296 | elapsed time per iteration (ms): 815307.1 | throughput per GPU (TFLOP/s/GPU): 63.8 | learning rate: 3.970748E-06 | global batch size:     8 | lm loss: 8.723611E-01 | loss scale: 1.0 | grad norm: 0.678 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
 [2024-11-29 15:14:26] iteration      163/     500 | consumed samples:         1304 | elapsed time per iteration (ms): 583954.2 | throughput per GPU (TFLOP/s/GPU): 89.1 | learning rate: 3.957789E-06 | global batch size:     8 | lm loss: 8.921217E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513966480] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [648703, 1047814]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [560809, 1047106]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1046650, 741356]
[h264 @ 0x562160944a00] mmco: unref short failure
processed_samples 7100 unjoint_samples 7100 joint_samples 21 [88917, 1047729]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1046650, 741356]
processed_samples 7100 unjoint_samples 7100 joint_samples 21 [1027960, 1025959]
processed_samples 7100 unjoint_samples 7100 joint_samples 21 [88917, 1047729]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [648703, 1047814]
processed_samples 7100 unjoint_samples 7100 joint_samples 21 [449883, 1042906]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [560809, 1047106]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [136766, 1047023]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1022264, 887134]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1022264, 887134]
processed_samples 7100 unjoint_samples 7100 joint_samples 20 [136766, 1047023]
processed_samples 7100 unjoint_samples 7100 joint_samples 21 [449883, 1042906]
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 7100 unjoint_samples 7100 joint_samples 21 [1027960, 1025959]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
 [2024-11-29 15:26:40] iteration      164/     500 | consumed samples:         1312 | elapsed time per iteration (ms): 733709.8 | throughput per GPU (TFLOP/s/GPU): 70.9 | learning rate: 3.944771E-06 | global batch size:     8 | lm loss: 8.858916E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
 [2024-11-29 15:34:38] iteration      165/     500 | consumed samples:         1320 | elapsed time per iteration (ms): 478158.0 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 3.931695E-06 | global batch size:     8 | lm loss: 9.041308E-01 | loss scale: 1.0 | grad norm: 1.039 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
processed_samples 7200 unjoint_samples 7200 joint_samples 21 [117692, 1046949]
processed_samples 7200 unjoint_samples 7200 joint_samples 22 [392276, 1047241]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [421087, 1047023]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [862925, 1047106]
processed_samples 7200 unjoint_samples 7200 joint_samples 21 [420804, 1047729]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [929831, 1047814]
[h264 @ 0x55d51713dd00] mmco: unref short failure
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [1046650, 1013162]
processed_samples 7200 unjoint_samples 7200 joint_samples 21 [117692, 1046949]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [421087, 1047023]
processed_samples 7200 unjoint_samples 7200 joint_samples 22 [392276, 1047241]
processed_samples 7200 unjoint_samples 7200 joint_samples 21 [755432, 1042906]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [862925, 1047106]
processed_samples 7200 unjoint_samples 7200 joint_samples 21 [420804, 1047729]
processed_samples 7200 unjoint_samples 7200 joint_samples 21 [755432, 1042906]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [1046650, 1013162]
processed_samples 7200 unjoint_samples 7200 joint_samples 20 [929831, 1047814]
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
 [2024-11-29 15:43:43] iteration      166/     500 | consumed samples:         1328 | elapsed time per iteration (ms): 544539.0 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 3.918560E-06 | global batch size:     8 | lm loss: 8.623494E-01 | loss scale: 1.0 | grad norm: 0.699 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
 [2024-11-29 15:53:27] iteration      167/     500 | consumed samples:         1336 | elapsed time per iteration (ms): 584550.5 | throughput per GPU (TFLOP/s/GPU): 89.0 | learning rate: 3.905369E-06 | global batch size:     8 | lm loss: 9.145085E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-11-29 16:00:34] iteration      168/     500 | consumed samples:         1344 | elapsed time per iteration (ms): 426277.2 | throughput per GPU (TFLOP/s/GPU): 122.1 | learning rate: 3.892120E-06 | global batch size:     8 | lm loss: 8.464835E-01 | loss scale: 1.0 | grad norm: 0.658 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
 [2024-11-29 16:09:00] iteration      169/     500 | consumed samples:         1352 | elapsed time per iteration (ms): 506237.6 | throughput per GPU (TFLOP/s/GPU): 102.8 | learning rate: 3.878815E-06 | global batch size:     8 | lm loss: 8.497138E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1047065, 216542]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1043644, 160378]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1047065, 216542]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1043644, 160378]
processed_samples 7300 unjoint_samples 7300 joint_samples 20 [753653, 1047023]
processed_samples 7300 unjoint_samples 7300 joint_samples 22 [1047180, 11000]
processed_samples 7300 unjoint_samples 7300 joint_samples 22 [1047180, 11000]
processed_samples 7300 unjoint_samples 7300 joint_samples 20 [753653, 1047023]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [343501, 1046704]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [343501, 1046704]
processed_samples 7300 unjoint_samples 7300 joint_samples 22 [687319, 1047241]
processed_samples 7300 unjoint_samples 7300 joint_samples 22 [687319, 1047241]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [532855, 1046949]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [532855, 1046949]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [704456, 1047729]
processed_samples 7300 unjoint_samples 7300 joint_samples 21 [704456, 1047729]
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1026663, 31663]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1026663, 31663]
 [2024-11-29 16:18:57] iteration      170/     500 | consumed samples:         1360 | elapsed time per iteration (ms): 597046.4 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 3.865454E-06 | global batch size:     8 | lm loss: 9.044704E-01 | loss scale: 1.0 | grad norm: 1.100 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [967174, 1047729]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [967174, 1047729]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1047065, 497602]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1047065, 497602]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1043644, 528225]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [776663, 1046949]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [557112, 1046704]
processed_samples 7400 unjoint_samples 7400 joint_samples 22 [965373, 1047241]
processed_samples 7400 unjoint_samples 7400 joint_samples 22 [1047180, 332698]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1043644, 528225]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [776663, 1046949]
processed_samples 7400 unjoint_samples 7400 joint_samples 21 [557112, 1046704]
processed_samples 7400 unjoint_samples 7400 joint_samples 22 [1047180, 332698]
processed_samples 7400 unjoint_samples 7400 joint_samples 22 [965373, 1047241]
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
 [2024-11-29 16:29:47] iteration      171/     500 | consumed samples:         1368 | elapsed time per iteration (ms): 650220.2 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 3.852039E-06 | global batch size:     8 | lm loss: 8.603762E-01 | loss scale: 1.0 | grad norm: 0.745 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
 [2024-11-29 16:40:14] iteration      172/     500 | consumed samples:         1376 | elapsed time per iteration (ms): 627029.4 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 3.838568E-06 | global batch size:     8 | lm loss: 8.916647E-01 | loss scale: 1.0 | grad norm: 0.893 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1026663, 273522]
processed_samples 7500 unjoint_samples 7500 joint_samples 23 [229957, 1047241]
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1047065, 862282]
processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1041766, 199325]
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1043644, 821705]
processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1047180, 651690]
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [847600, 1046704]
processed_samples 7500 unjoint_samples 7500 joint_samples 23 [229957, 1047241]
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1037382, 1046949]
processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1041766, 199325]
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1026663, 273522]
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1047065, 862282]
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1043644, 821705]
processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1047180, 651690]
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1037382, 1046949]
processed_samples 7500 unjoint_samples 7500 joint_samples 21 [847600, 1046704]
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
 [2024-11-29 16:48:54] iteration      173/     500 | consumed samples:         1384 | elapsed time per iteration (ms): 519428.4 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 3.825043E-06 | global batch size:     8 | lm loss: 8.812805E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
 [2024-11-29 16:58:28] iteration      174/     500 | consumed samples:         1392 | elapsed time per iteration (ms): 573912.8 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 3.811465E-06 | global batch size:     8 | lm loss: 8.597746E-01 | loss scale: 1.0 | grad norm: 0.601 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
 [2024-11-29 17:07:40] iteration      175/     500 | consumed samples:         1400 | elapsed time per iteration (ms): 552315.1 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 3.797834E-06 | global batch size:     8 | lm loss: 9.125093E-01 | loss scale: 1.0 | grad norm: 0.722 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
 [2024-11-29 17:15:39] iteration      176/     500 | consumed samples:         1408 | elapsed time per iteration (ms): 479317.1 | throughput per GPU (TFLOP/s/GPU): 108.6 | learning rate: 3.784151E-06 | global batch size:     8 | lm loss: 8.529758E-01 | loss scale: 1.0 | grad norm: 1.067 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x562161d4c640] mmco: unref short failure
[h264 @ 0x562161d4c640] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [190869, 997682]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1036661, 193993]
processed_samples 7600 unjoint_samples 7600 joint_samples 23 [492675, 1047241]
processed_samples 7600 unjoint_samples 7600 joint_samples 21 [1026663, 545583]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [403092, 1046949]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1041766, 541558]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1047180, 953300]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1045878, 245108]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [190869, 997682]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1036661, 193993]
processed_samples 7600 unjoint_samples 7600 joint_samples 23 [492675, 1047241]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1041766, 541558]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1047180, 953300]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [403092, 1046949]
processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1045878, 245108]
processed_samples 7600 unjoint_samples 7600 joint_samples 21 [1026663, 545583]
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
 [2024-11-29 17:24:31] iteration      177/     500 | consumed samples:         1416 | elapsed time per iteration (ms): 531521.6 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 3.770416E-06 | global batch size:     8 | lm loss: 9.014975E-01 | loss scale: 1.0 | grad norm: 0.680 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 7700 unjoint_samples 7700 joint_samples 23 [228736, 1044116]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [451395, 997682]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [702166, 1046949]
processed_samples 7700 unjoint_samples 7700 joint_samples 23 [791887, 1047241]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1041766, 881148]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1036661, 501386]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1045878, 578684]
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
processed_samples 7700 unjoint_samples 7700 joint_samples 21 [1026663, 814539]
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 7700 unjoint_samples 7700 joint_samples 23 [228736, 1044116]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [702166, 1046949]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [451395, 997682]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1041766, 881148]
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1045878, 578684]
processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1036661, 501386]
processed_samples 7700 unjoint_samples 7700 joint_samples 23 [791887, 1047241]
processed_samples 7700 unjoint_samples 7700 joint_samples 21 [1026663, 814539]
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
 [2024-11-29 17:35:52] iteration      178/     500 | consumed samples:         1424 | elapsed time per iteration (ms): 680769.6 | throughput per GPU (TFLOP/s/GPU): 76.4 | learning rate: 3.756630E-06 | global batch size:     8 | lm loss: 7.555795E-01 | loss scale: 1.0 | grad norm: 0.551 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [809672, 997682]
processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1047475, 90400]
processed_samples 7800 unjoint_samples 7800 joint_samples 23 [553619, 1044116]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1036661, 886667]
processed_samples 7800 unjoint_samples 7800 joint_samples 21 [1040458, 1040562]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1045878, 857994]
processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1046959, 1047241]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [987884, 1046949]
[h264 @ 0x55d516907ac0] mmco: unref short failure
processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1047475, 90400]
processed_samples 7800 unjoint_samples 7800 joint_samples 23 [553619, 1044116]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [809672, 997682]
processed_samples 7800 unjoint_samples 7800 joint_samples 21 [1040458, 1040562]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1036661, 886667]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1045878, 857994]
processed_samples 7800 unjoint_samples 7800 joint_samples 22 [987884, 1046949]
processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1046959, 1047241]
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
 [2024-11-29 17:45:43] iteration      179/     500 | consumed samples:         1432 | elapsed time per iteration (ms): 591570.5 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 3.742793E-06 | global batch size:     8 | lm loss: 9.391325E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-11-29 17:57:18] iteration      180/     500 | consumed samples:         1440 | elapsed time per iteration (ms): 694943.9 | throughput per GPU (TFLOP/s/GPU): 74.9 | learning rate: 3.728906E-06 | global batch size:     8 | lm loss: 8.526995E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (256118.44, 256118.80)
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
 [2024-11-29 18:10:37] iteration      181/     500 | consumed samples:         1448 | elapsed time per iteration (ms): 542974.2 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 3.714969E-06 | global batch size:     8 | lm loss: 9.044622E-01 | loss scale: 1.0 | grad norm: 0.630 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
 [2024-11-29 18:18:57] iteration      182/     500 | consumed samples:         1456 | elapsed time per iteration (ms): 499317.5 | throughput per GPU (TFLOP/s/GPU): 104.2 | learning rate: 3.700984E-06 | global batch size:     8 | lm loss: 8.581277E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 18:28:16] iteration      183/     500 | consumed samples:         1464 | elapsed time per iteration (ms): 559222.9 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 3.686950E-06 | global batch size:     8 | lm loss: 8.661172E-01 | loss scale: 1.0 | grad norm: 0.658 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
 [2024-11-29 18:37:05] iteration      184/     500 | consumed samples:         1472 | elapsed time per iteration (ms): 529442.2 | throughput per GPU (TFLOP/s/GPU): 98.3 | learning rate: 3.672869E-06 | global batch size:     8 | lm loss: 8.653899E-01 | loss scale: 1.0 | grad norm: 0.660 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 7900 unjoint_samples 7900 joint_samples 22 [1047361, 270413]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [153090, 1038342]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [173152, 1040042]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [270168, 1046949]
processed_samples 7900 unjoint_samples 7900 joint_samples 24 [1046959, 305001]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [1047475, 338689]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [180848, 1038095]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [912201, 1044116]
processed_samples 7900 unjoint_samples 7900 joint_samples 22 [1047361, 270413]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [153090, 1038342]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [173152, 1040042]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [270168, 1046949]
processed_samples 7900 unjoint_samples 7900 joint_samples 24 [1046959, 305001]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [1047475, 338689]
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [180848, 1038095]
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 7900 unjoint_samples 7900 joint_samples 23 [912201, 1044116]
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
 [2024-11-29 18:45:59] iteration      185/     500 | consumed samples:         1480 | elapsed time per iteration (ms): 533290.0 | throughput per GPU (TFLOP/s/GPU): 97.6 | learning rate: 3.658740E-06 | global batch size:     8 | lm loss: 8.636235E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [524852, 1046949]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [542277, 1038095]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [420972, 1038342]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [451751, 1040042]
processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1039352, 179325]
processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1046959, 604582]
processed_samples 8000 unjoint_samples 8000 joint_samples 22 [1047361, 557644]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [1047475, 823289]
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [524852, 1046949]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [542277, 1038095]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [420972, 1038342]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [451751, 1040042]
processed_samples 8000 unjoint_samples 8000 joint_samples 22 [1047361, 557644]
[h264 @ 0x55d51442bac0] mmco: unref short failure
processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1039352, 179325]
processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1046959, 604582]
processed_samples 8000 unjoint_samples 8000 joint_samples 23 [1047475, 823289]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x55d51442bac0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
 [2024-11-29 18:56:18] iteration      186/     500 | consumed samples:         1488 | elapsed time per iteration (ms): 619369.7 | throughput per GPU (TFLOP/s/GPU): 84.0 | learning rate: 3.644565E-06 | global batch size:     8 | lm loss: 8.731977E-01 | loss scale: 1.0 | grad norm: 0.936 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [682605, 1038342]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [836071, 1046949]
processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1047475, 92479]
processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1046959, 885564]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [841637, 1038095]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [826311, 1040042]
processed_samples 8100 unjoint_samples 8100 joint_samples 22 [1047361, 901929]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1039352, 435904]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [841637, 1038095]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [682605, 1038342]
processed_samples 8100 unjoint_samples 8100 joint_samples 22 [1047361, 901929]
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1047475, 92479]
processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1039352, 435904]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [826311, 1040042]
processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1046959, 885564]
processed_samples 8100 unjoint_samples 8100 joint_samples 23 [836071, 1046949]
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
 [2024-11-29 19:06:07] iteration      187/     500 | consumed samples:         1496 | elapsed time per iteration (ms): 589183.6 | throughput per GPU (TFLOP/s/GPU): 88.3 | learning rate: 3.630344E-06 | global batch size:     8 | lm loss: 8.653762E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
 [2024-11-29 19:15:54] iteration      188/     500 | consumed samples:         1504 | elapsed time per iteration (ms): 586733.4 | throughput per GPU (TFLOP/s/GPU): 88.7 | learning rate: 3.616078E-06 | global batch size:     8 | lm loss: 8.779374E-01 | loss scale: 1.0 | grad norm: 0.729 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 19:24:21] iteration      189/     500 | consumed samples:         1512 | elapsed time per iteration (ms): 506984.3 | throughput per GPU (TFLOP/s/GPU): 102.6 | learning rate: 3.601767E-06 | global batch size:     8 | lm loss: 8.729650E-01 | loss scale: 1.0 | grad norm: 0.679 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
 [2024-11-29 19:33:59] iteration      190/     500 | consumed samples:         1520 | elapsed time per iteration (ms): 577936.4 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 3.587412E-06 | global batch size:     8 | lm loss: 8.692999E-01 | loss scale: 1.0 | grad norm: 0.624 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
 [2024-11-29 19:43:30] iteration      191/     500 | consumed samples:         1528 | elapsed time per iteration (ms): 571544.2 | throughput per GPU (TFLOP/s/GPU): 91.1 | learning rate: 3.573013E-06 | global batch size:     8 | lm loss: 8.751049E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [38114, 1046907]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1046784, 112401]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1046784, 112401]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [38114, 1046907]
processed_samples 8200 unjoint_samples 8200 joint_samples 23 [127718, 1036268]
processed_samples 8200 unjoint_samples 8200 joint_samples 25 [88897, 1046533]
processed_samples 8200 unjoint_samples 8200 joint_samples 25 [88897, 1046533]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1047475, 479969]
processed_samples 8200 unjoint_samples 8200 joint_samples 23 [127718, 1036268]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [166601, 1046949]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [166601, 1046949]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1047475, 479969]
processed_samples 8200 unjoint_samples 8200 joint_samples 23 [978353, 1038342]
processed_samples 8200 unjoint_samples 8200 joint_samples 23 [978353, 1038342]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1039352, 658160]
processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1039352, 658160]
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
 [2024-11-29 19:52:35] iteration      192/     500 | consumed samples:         1536 | elapsed time per iteration (ms): 544754.7 | throughput per GPU (TFLOP/s/GPU): 95.5 | learning rate: 3.558572E-06 | global batch size:     8 | lm loss: 8.747140E-01 | loss scale: 1.0 | grad norm: 0.574 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 8300 unjoint_samples 8300 joint_samples 25 [541212, 1046533]
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [232124, 1045285]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1046784, 502722]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [471537, 1046949]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [444120, 1046907]
processed_samples 8300 unjoint_samples 8300 joint_samples 23 [530384, 1036268]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1047475, 823090]
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1039352, 993587]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [232124, 1045285]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [444120, 1046907]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1046784, 502722]
[h264 @ 0x55d5141fd280] mmco: unref short failure
processed_samples 8300 unjoint_samples 8300 joint_samples 23 [530384, 1036268]
processed_samples 8300 unjoint_samples 8300 joint_samples 25 [541212, 1046533]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [471537, 1046949]
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1047475, 823090]
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1039352, 993587]
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
 [2024-11-29 20:02:10] iteration      193/     500 | consumed samples:         1544 | elapsed time per iteration (ms): 574455.4 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 3.544088E-06 | global batch size:     8 | lm loss: 8.469065E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
 [2024-11-29 20:09:59] iteration      194/     500 | consumed samples:         1552 | elapsed time per iteration (ms): 469151.3 | throughput per GPU (TFLOP/s/GPU): 110.9 | learning rate: 3.529562E-06 | global batch size:     8 | lm loss: 9.101482E-01 | loss scale: 1.0 | grad norm: 0.728 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b6ccc40] mmco: unref short failure
[h264 @ 0x56215b6ccc40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [671898, 1045285]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [1046784, 764002]
processed_samples 8400 unjoint_samples 8400 joint_samples 25 [269407, 1043464]
processed_samples 8400 unjoint_samples 8400 joint_samples 25 [1047475, 100697]
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
processed_samples 8400 unjoint_samples 8400 joint_samples 25 [876980, 1046533]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [866861, 1046949]
processed_samples 8400 unjoint_samples 8400 joint_samples 23 [1015102, 1036268]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [862943, 1046907]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
processed_samples 8400 unjoint_samples 8400 joint_samples 25 [269407, 1043464]
processed_samples 8400 unjoint_samples 8400 joint_samples 25 [1047475, 100697]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [671898, 1045285]
processed_samples 8400 unjoint_samples 8400 joint_samples 25 [876980, 1046533]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [1046784, 764002]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [862943, 1046907]
processed_samples 8400 unjoint_samples 8400 joint_samples 24 [866861, 1046949]
processed_samples 8400 unjoint_samples 8400 joint_samples 23 [1015102, 1036268]
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
 [2024-11-29 20:21:04] iteration      195/     500 | consumed samples:         1560 | elapsed time per iteration (ms): 664879.8 | throughput per GPU (TFLOP/s/GPU): 78.3 | learning rate: 3.514996E-06 | global batch size:     8 | lm loss: 8.420240E-01 | loss scale: 1.0 | grad norm: 0.561 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
 [2024-11-29 20:30:16] iteration      196/     500 | consumed samples:         1568 | elapsed time per iteration (ms): 552301.5 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 3.500388E-06 | global batch size:     8 | lm loss: 8.576379E-01 | loss scale: 1.0 | grad norm: 0.711 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
 [2024-11-29 20:37:35] iteration      197/     500 | consumed samples:         1576 | elapsed time per iteration (ms): 439104.5 | throughput per GPU (TFLOP/s/GPU): 118.5 | learning rate: 3.485741E-06 | global batch size:     8 | lm loss: 8.491023E-01 | loss scale: 1.0 | grad norm: 0.739 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d512dcb440] Missing reference picture, default is 65530
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215ec3c040] Missing reference picture, default is 65530
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
 [2024-11-29 20:51:16] iteration      198/     500 | consumed samples:         1584 | elapsed time per iteration (ms): 821217.4 | throughput per GPU (TFLOP/s/GPU): 63.4 | learning rate: 3.471055E-06 | global batch size:     8 | lm loss: 8.837790E-01 | loss scale: 1.0 | grad norm: 1.253 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1040042, 256314]
processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1040042, 256314]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1047475, 407578]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1047475, 407578]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [122087, 1046907]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [122087, 1046907]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1032949, 217199]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1032949, 217199]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [606117, 1043464]
processed_samples 8500 unjoint_samples 8500 joint_samples 26 [100937, 1046533]
processed_samples 8500 unjoint_samples 8500 joint_samples 26 [100937, 1046533]
processed_samples 8500 unjoint_samples 8500 joint_samples 25 [606117, 1043464]
processed_samples 8500 unjoint_samples 8500 joint_samples 24 [973145, 1045285]
processed_samples 8500 unjoint_samples 8500 joint_samples 24 [973145, 1045285]
processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1046784, 1022117]
processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1046784, 1022117]
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
 [2024-11-29 21:01:32] iteration      199/     500 | consumed samples:         1592 | elapsed time per iteration (ms): 615958.8 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 3.456330E-06 | global batch size:     8 | lm loss: 8.532241E-01 | loss scale: 1.0 | grad norm: 0.683 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
 [2024-11-29 21:10:41] iteration      200/     500 | consumed samples:         1600 | elapsed time per iteration (ms): 548343.7 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 3.441567E-06 | global batch size:     8 | lm loss: 8.384378E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (261620.54, 261621.03)
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [167071, 1045285]
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047181, 277034]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [395778, 1046907]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1032949, 580621]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047475, 684053]
processed_samples 8600 unjoint_samples 8600 joint_samples 26 [439469, 1046533]
[h264 @ 0x56215ec3c040] mmco: unref short failure
processed_samples 8600 unjoint_samples 8600 joint_samples 24 [1040042, 611062]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [856930, 1043464]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047475, 684053]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [167071, 1045285]
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 8600 unjoint_samples 8600 joint_samples 26 [439469, 1046533]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047181, 277034]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [395778, 1046907]
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1032949, 580621]
processed_samples 8600 unjoint_samples 8600 joint_samples 24 [1040042, 611062]
[h264 @ 0x55d51a1f0840] mmco: unref short failure
processed_samples 8600 unjoint_samples 8600 joint_samples 25 [856930, 1043464]
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
 [2024-11-29 21:26:31] iteration      201/     500 | consumed samples:         1608 | elapsed time per iteration (ms): 688308.4 | throughput per GPU (TFLOP/s/GPU): 75.6 | learning rate: 3.426767E-06 | global batch size:     8 | lm loss: 8.486854E-01 | loss scale: 1.0 | grad norm: 0.611 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-11-29 21:35:52] iteration      202/     500 | consumed samples:         1616 | elapsed time per iteration (ms): 561512.8 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 3.411930E-06 | global batch size:     8 | lm loss: 8.823071E-01 | loss scale: 1.0 | grad norm: 0.766 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1047475, 15799]
processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1047475, 15799]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [905731, 1046907]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [905731, 1046907]
processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1045874, 85168]
processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1045874, 85168]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1047181, 547941]
processed_samples 8700 unjoint_samples 8700 joint_samples 24 [1040042, 863555]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [549219, 1045285]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [549219, 1045285]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1032949, 903571]
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1047181, 547941]
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1032949, 903571]
processed_samples 8700 unjoint_samples 8700 joint_samples 24 [1040042, 863555]
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 8700 unjoint_samples 8700 joint_samples 26 [756152, 1046533]
processed_samples 8700 unjoint_samples 8700 joint_samples 26 [756152, 1046533]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
 [2024-11-29 21:45:23] iteration      203/     500 | consumed samples:         1624 | elapsed time per iteration (ms): 570436.1 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 3.397056E-06 | global batch size:     8 | lm loss: 7.746488E-01 | loss scale: 1.0 | grad norm: 0.692 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
 [2024-11-29 21:56:33] iteration      204/     500 | consumed samples:         1632 | elapsed time per iteration (ms): 669819.5 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 3.382147E-06 | global batch size:     8 | lm loss: 8.163378E-01 | loss scale: 1.0 | grad norm: 0.559 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 22:06:29] iteration      205/     500 | consumed samples:         1640 | elapsed time per iteration (ms): 596426.7 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 3.367203E-06 | global batch size:     8 | lm loss: 8.137987E-01 | loss scale: 1.0 | grad norm: 0.586 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047277, 184480]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1038728, 266508]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1045874, 453231]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [104759, 1046728]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1047475, 320617]
processed_samples 8800 unjoint_samples 8800 joint_samples 25 [873743, 1045285]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [995735, 1046533]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1038728, 266508]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [104759, 1046728]
processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047277, 184480]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1047475, 320617]
processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047181, 906245]
processed_samples 8800 unjoint_samples 8800 joint_samples 25 [873743, 1045285]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1045874, 453231]
processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047181, 906245]
processed_samples 8800 unjoint_samples 8800 joint_samples 26 [995735, 1046533]
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
 [2024-11-29 22:15:48] iteration      206/     500 | consumed samples:         1648 | elapsed time per iteration (ms): 559389.8 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 3.352225E-06 | global batch size:     8 | lm loss: 8.419443E-01 | loss scale: 1.0 | grad norm: 0.620 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
 [2024-11-29 22:24:43] iteration      207/     500 | consumed samples:         1656 | elapsed time per iteration (ms): 534930.3 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 3.337214E-06 | global batch size:     8 | lm loss: 8.349214E-01 | loss scale: 1.0 | grad norm: 0.720 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-11-29 22:34:40] iteration      208/     500 | consumed samples:         1664 | elapsed time per iteration (ms): 596473.5 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 3.322169E-06 | global batch size:     8 | lm loss: 8.279457E-01 | loss scale: 1.0 | grad norm: 0.641 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [123257, 1046667]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [110864, 1045285]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1038728, 538213]
processed_samples 8900 unjoint_samples 8900 joint_samples 25 [1047277, 683180]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [471169, 1046728]
processed_samples 8900 unjoint_samples 8900 joint_samples 27 [1037217, 201421]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [123257, 1046667]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [110864, 1045285]
processed_samples 8900 unjoint_samples 8900 joint_samples 27 [1037217, 201421]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1047475, 676775]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1038728, 538213]
processed_samples 8900 unjoint_samples 8900 joint_samples 25 [1047277, 683180]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1045874, 725442]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1047475, 676775]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [471169, 1046728]
processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1045874, 725442]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x56215ee0f040] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
processed_samples 9000 unjoint_samples 9000 joint_samples 25 [1047277, 950375]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [512455, 1045285]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1038728, 849792]
processed_samples 9000 unjoint_samples 9000 joint_samples 27 [1037217, 530048]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [464782, 1046667]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1045874, 994156]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1047475, 970385]
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [704371, 1046728]
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 9000 unjoint_samples 9000 joint_samples 25 [1047277, 950375]
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [512455, 1045285]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [464782, 1046667]
processed_samples 9000 unjoint_samples 9000 joint_samples 27 [1037217, 530048]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1038728, 849792]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1047475, 970385]
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1045874, 994156]
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
processed_samples 9000 unjoint_samples 9000 joint_samples 26 [704371, 1046728]
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-11-29 22:46:50] iteration      209/     500 | consumed samples:         1672 | elapsed time per iteration (ms): 730291.0 | throughput per GPU (TFLOP/s/GPU): 71.3 | learning rate: 3.307092E-06 | global batch size:     8 | lm loss: 8.303400E-01 | loss scale: 1.0 | grad norm: 0.633 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 22:55:07] iteration      210/     500 | consumed samples:         1680 | elapsed time per iteration (ms): 497043.8 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 3.291983E-06 | global batch size:     8 | lm loss: 7.948712E-01 | loss scale: 1.0 | grad norm: 0.621 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-29 23:04:42] iteration      211/     500 | consumed samples:         1688 | elapsed time per iteration (ms): 574463.4 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 3.276843E-06 | global batch size:     8 | lm loss: 9.035169E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
 [2024-11-29 23:13:44] iteration      212/     500 | consumed samples:         1696 | elapsed time per iteration (ms): 542007.3 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 3.261672E-06 | global batch size:     8 | lm loss: 9.377707E-01 | loss scale: 1.0 | grad norm: 0.808 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [225650, 1046705]
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [795430, 1046667]
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1045874, 296818]
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [250395, 1033542]
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [91770, 1047083]
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1037217, 874982]
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [937867, 1046728]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [91770, 1047083]
[h264 @ 0x56215f060d00] mmco: unref short failure
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [225650, 1046705]
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [250395, 1033542]
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [895416, 1045285]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [937867, 1046728]
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [795430, 1046667]
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1045874, 296818]
processed_samples 9100 unjoint_samples 9100 joint_samples 26 [895416, 1045285]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1037217, 874982]
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
 [2024-11-29 23:23:09] iteration      213/     500 | consumed samples:         1704 | elapsed time per iteration (ms): 564882.0 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 3.246472E-06 | global batch size:     8 | lm loss: 8.029411E-01 | loss scale: 1.0 | grad norm: 0.617 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
 [2024-11-29 23:31:09] iteration      214/     500 | consumed samples:         1712 | elapsed time per iteration (ms): 480448.6 | throughput per GPU (TFLOP/s/GPU): 108.3 | learning rate: 3.231242E-06 | global batch size:     8 | lm loss: 8.489534E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
 [2024-11-29 23:38:57] iteration      215/     500 | consumed samples:         1720 | elapsed time per iteration (ms): 468395.2 | throughput per GPU (TFLOP/s/GPU): 111.1 | learning rate: 3.215984E-06 | global batch size:     8 | lm loss: 8.099984E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
 [2024-11-29 23:49:10] iteration      216/     500 | consumed samples:         1728 | elapsed time per iteration (ms): 612723.6 | throughput per GPU (TFLOP/s/GPU): 84.9 | learning rate: 3.200697E-06 | global batch size:     8 | lm loss: 8.494784E-01 | loss scale: 1.0 | grad norm: 0.565 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [73991, 1046667]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1012937, 220560]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [172362, 1046728]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1045874, 648534]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [73991, 1046667]
processed_samples 9200 unjoint_samples 9200 joint_samples 28 [154822, 1039938]
processed_samples 9200 unjoint_samples 9200 joint_samples 26 [530122, 1046705]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [172362, 1046728]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1012937, 220560]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [462192, 1033542]
processed_samples 9200 unjoint_samples 9200 joint_samples 28 [154822, 1039938]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [465065, 1047083]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [465065, 1047083]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [462192, 1033542]
processed_samples 9200 unjoint_samples 9200 joint_samples 26 [530122, 1046705]
processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1045874, 648534]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215bb389c0] mmco: unref short failure
[h264 @ 0x56215bb389c0] mmco: unref short failure
[h264 @ 0x56215bb389c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb389c0] mmco: unref short failure
[h264 @ 0x56215bb389c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1012937, 522162]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [460248, 1046728]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [882129, 1033542]
processed_samples 9300 unjoint_samples 9300 joint_samples 26 [908179, 1046705]
processed_samples 9300 unjoint_samples 9300 joint_samples 28 [552738, 1039938]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [492165, 1046667]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1045874, 936492]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [810535, 1047083]
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [492165, 1046667]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [460248, 1046728]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1012937, 522162]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [882129, 1033542]
processed_samples 9300 unjoint_samples 9300 joint_samples 26 [908179, 1046705]
processed_samples 9300 unjoint_samples 9300 joint_samples 28 [552738, 1039938]
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [810535, 1047083]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1045874, 936492]
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
 [2024-11-30 00:00:32] iteration      217/     500 | consumed samples:         1736 | elapsed time per iteration (ms): 682231.1 | throughput per GPU (TFLOP/s/GPU): 76.3 | learning rate: 3.185384E-06 | global batch size:     8 | lm loss: 8.043656E-01 | loss scale: 1.0 | grad norm: 0.589 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
 [2024-11-30 00:08:50] iteration      218/     500 | consumed samples:         1744 | elapsed time per iteration (ms): 497708.4 | throughput per GPU (TFLOP/s/GPU): 104.6 | learning rate: 3.170044E-06 | global batch size:     8 | lm loss: 9.035359E-01 | loss scale: 1.0 | grad norm: 0.570 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
 [2024-11-30 00:19:10] iteration      219/     500 | consumed samples:         1752 | elapsed time per iteration (ms): 619806.1 | throughput per GPU (TFLOP/s/GPU): 84.0 | learning rate: 3.154678E-06 | global batch size:     8 | lm loss: 7.692378E-01 | loss scale: 1.0 | grad norm: 0.671 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
 [2024-11-30 00:27:23] iteration      220/     500 | consumed samples:         1760 | elapsed time per iteration (ms): 493399.7 | throughput per GPU (TFLOP/s/GPU): 105.5 | learning rate: 3.139286E-06 | global batch size:     8 | lm loss: 8.335562E-01 | loss scale: 1.0 | grad norm: 0.625 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (272546.17, 272546.63)
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [206860, 1046819]
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [924978, 1046667]
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [158827, 1047083]
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [220743, 1046128]
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [1044569, 139781]
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [1012937, 784226]
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [809095, 1039938]
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [876727, 1046728]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [206860, 1046819]
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [924978, 1046667]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [1012937, 784226]
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [158827, 1047083]
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [1044569, 139781]
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [220743, 1046128]
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 28 [809095, 1039938]
[h264 @ 0x55d51326c900] mmco: unref short failure
processed_samples 9400 unjoint_samples 9400 joint_samples 27 [876727, 1046728]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
 [2024-11-30 00:41:52] iteration      221/     500 | consumed samples:         1768 | elapsed time per iteration (ms): 596070.1 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 3.123870E-06 | global batch size:     8 | lm loss: 8.337880E-01 | loss scale: 1.0 | grad norm: 0.659 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
 [2024-11-30 00:50:18] iteration      222/     500 | consumed samples:         1776 | elapsed time per iteration (ms): 505900.6 | throughput per GPU (TFLOP/s/GPU): 102.9 | learning rate: 3.108430E-06 | global batch size:     8 | lm loss: 8.406662E-01 | loss scale: 1.0 | grad norm: 0.617 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-30 00:59:00] iteration      223/     500 | consumed samples:         1784 | elapsed time per iteration (ms): 521908.2 | throughput per GPU (TFLOP/s/GPU): 99.7 | learning rate: 3.092966E-06 | global batch size:     8 | lm loss: 8.790482E-01 | loss scale: 1.0 | grad norm: 0.600 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
 [2024-11-30 01:07:02] iteration      224/     500 | consumed samples:         1792 | elapsed time per iteration (ms): 481628.5 | throughput per GPU (TFLOP/s/GPU): 108.1 | learning rate: 3.077479E-06 | global batch size:     8 | lm loss: 8.450036E-01 | loss scale: 1.0 | grad norm: 0.597 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [137705, 1019544]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [137705, 1019544]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1040380, 142585]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1040380, 142585]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1038227, 242054]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1038227, 242054]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [410129, 1047083]
processed_samples 9500 unjoint_samples 9500 joint_samples 29 [1024028, 124413]
processed_samples 9500 unjoint_samples 9500 joint_samples 29 [1024028, 124413]
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
processed_samples 9500 unjoint_samples 9500 joint_samples 27 [512095, 1046819]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [410129, 1047083]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [629736, 1046128]
processed_samples 9500 unjoint_samples 9500 joint_samples 27 [512095, 1046819]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [629736, 1046128]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1044569, 396197]
processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1044569, 396197]
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1038227, 628474]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [478086, 1019544]
processed_samples 9600 unjoint_samples 9600 joint_samples 29 [1024028, 448485]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [669891, 1047083]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1040380, 432600]
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1044569, 719903]
processed_samples 9600 unjoint_samples 9600 joint_samples 27 [795874, 1046819]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [920557, 1046128]
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [478086, 1019544]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [669891, 1047083]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1040380, 432600]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1038227, 628474]
processed_samples 9600 unjoint_samples 9600 joint_samples 29 [1024028, 448485]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [920557, 1046128]
processed_samples 9600 unjoint_samples 9600 joint_samples 27 [795874, 1046819]
processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1044569, 719903]
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
 [2024-11-30 01:18:04] iteration      225/     500 | consumed samples:         1800 | elapsed time per iteration (ms): 662503.8 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 3.061971E-06 | global batch size:     8 | lm loss: 7.138479E-01 | loss scale: 1.0 | grad norm: 0.706 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
 [2024-11-30 01:26:49] iteration      226/     500 | consumed samples:         1808 | elapsed time per iteration (ms): 525016.6 | throughput per GPU (TFLOP/s/GPU): 99.1 | learning rate: 3.046440E-06 | global batch size:     8 | lm loss: 8.335893E-01 | loss scale: 1.0 | grad norm: 0.724 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1013319, 298023]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1040380, 753225]
processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1028990, 185829]
processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1024028, 795797]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [805458, 1019544]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1044569, 1033781]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [994371, 1047083]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1038227, 910477]
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1040380, 753225]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1028990, 185829]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1013319, 298023]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [994371, 1047083]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [805458, 1019544]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1038227, 910477]
processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1024028, 795797]
processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1044569, 1033781]
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
 [2024-11-30 01:39:17] iteration      227/     500 | consumed samples:         1816 | elapsed time per iteration (ms): 747736.8 | throughput per GPU (TFLOP/s/GPU): 69.6 | learning rate: 3.030889E-06 | global batch size:     8 | lm loss: 8.548321E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
 [2024-11-30 01:49:00] iteration      228/     500 | consumed samples:         1824 | elapsed time per iteration (ms): 583444.9 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.015318E-06 | global batch size:     8 | lm loss: 8.605866E-01 | loss scale: 1.0 | grad norm: 1.385 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215d202940] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
 [2024-11-30 01:58:18] iteration      229/     500 | consumed samples:         1832 | elapsed time per iteration (ms): 558010.4 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 2.999727E-06 | global batch size:     8 | lm loss: 8.125361E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
 [2024-11-30 02:06:47] iteration      230/     500 | consumed samples:         1840 | elapsed time per iteration (ms): 509049.1 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 2.984118E-06 | global batch size:     8 | lm loss: 9.170176E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 02:15:32] iteration      231/     500 | consumed samples:         1848 | elapsed time per iteration (ms): 524853.1 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.968490E-06 | global batch size:     8 | lm loss: 8.902583E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215afffc40] mmco: unref short failure
[h264 @ 0x56215afffc40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [203755, 1047083]
processed_samples 9800 unjoint_samples 9800 joint_samples 28 [1013319, 591862]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [81067, 1039558]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [290606, 1046311]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [197089, 1044398]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1038227, 125259]
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1028990, 462177]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1024028, 1011840]
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [203755, 1047083]
processed_samples 9800 unjoint_samples 9800 joint_samples 28 [1013319, 591862]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [81067, 1039558]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1038227, 125259]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [197089, 1044398]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [290606, 1046311]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516a44800] mmco: unref short failure
[h264 @ 0x55d516a44800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1028990, 462177]
processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1024028, 1011840]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
 [2024-11-30 02:22:56] iteration      232/     500 | consumed samples:         1856 | elapsed time per iteration (ms): 443719.1 | throughput per GPU (TFLOP/s/GPU): 117.3 | learning rate: 2.952845E-06 | global batch size:     8 | lm loss: 7.840450E-01 | loss scale: 1.0 | grad norm: 0.627 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [386430, 1039558]
processed_samples 9900 unjoint_samples 9900 joint_samples 30 [1031356, 468096]
[h264 @ 0x56215d8ddd40] mmco: unref short failure
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [708788, 1044398]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1038227, 385286]
processed_samples 9900 unjoint_samples 9900 joint_samples 28 [1013319, 953069]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [584501, 1047083]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [588057, 1046311]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1028990, 714164]
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [584501, 1047083]
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [386430, 1039558]
processed_samples 9900 unjoint_samples 9900 joint_samples 30 [1031356, 468096]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1038227, 385286]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [588057, 1046311]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1028990, 714164]
processed_samples 9900 unjoint_samples 9900 joint_samples 29 [708788, 1044398]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 9900 unjoint_samples 9900 joint_samples 28 [1013319, 953069]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
 [2024-11-30 02:32:45] iteration      233/     500 | consumed samples:         1864 | elapsed time per iteration (ms): 589102.1 | throughput per GPU (TFLOP/s/GPU): 88.3 | learning rate: 2.937183E-06 | global batch size:     8 | lm loss: 8.486030E-01 | loss scale: 1.0 | grad norm: 0.649 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
 [2024-11-30 02:42:05] iteration      234/     500 | consumed samples:         1872 | elapsed time per iteration (ms): 559559.4 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 2.921504E-06 | global batch size:     8 | lm loss: 8.358369E-01 | loss scale: 1.0 | grad norm: 0.590 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1048318, 146360]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1039497, 93104]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1038227, 686845]
processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1031356, 786194]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [892478, 1046311]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1028990, 1031122]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [674954, 1039558]
processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1039497, 93104]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1048318, 146360]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [892478, 1046311]
processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1031356, 786194]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [964007, 1047083]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [674954, 1039558]
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1038227, 686845]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [964007, 1047083]
processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1028990, 1031122]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
 [2024-11-30 02:52:33] iteration      235/     500 | consumed samples:         1880 | elapsed time per iteration (ms): 628444.9 | throughput per GPU (TFLOP/s/GPU): 82.8 | learning rate: 2.905810E-06 | global batch size:     8 | lm loss: 8.173471E-01 | loss scale: 1.0 | grad norm: 0.545 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 03:02:30] iteration      236/     500 | consumed samples:         1888 | elapsed time per iteration (ms): 597310.8 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 2.890101E-06 | global batch size:     8 | lm loss: 7.991673E-01 | loss scale: 1.0 | grad norm: 0.632 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 03:14:13] iteration      237/     500 | consumed samples:         1896 | elapsed time per iteration (ms): 702636.3 | throughput per GPU (TFLOP/s/GPU): 74.1 | learning rate: 2.874378E-06 | global batch size:     8 | lm loss: 8.239874E-01 | loss scale: 1.0 | grad norm: 0.683 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1048318, 428681]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1039497, 395314]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1045781, 248108]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [959819, 399025]
processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1038227, 931388]
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 29 [911937, 1039558]
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1031356, 1034010]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1048096, 356039]
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1048318, 428681]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1039497, 395314]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1045781, 248108]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [959819, 399025]
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1031356, 1034010]
processed_samples 10100 unjoint_samples 10100 joint_samples 29 [911937, 1039558]
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1038227, 931388]
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1048096, 356039]
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
 [2024-11-30 03:23:23] iteration      238/     500 | consumed samples:         1904 | elapsed time per iteration (ms): 550026.1 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 2.858641E-06 | global batch size:     8 | lm loss: 8.139384E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
 [2024-11-30 03:33:08] iteration      239/     500 | consumed samples:         1912 | elapsed time per iteration (ms): 584777.9 | throughput per GPU (TFLOP/s/GPU): 89.0 | learning rate: 2.842891E-06 | global batch size:     8 | lm loss: 8.176109E-01 | loss scale: 1.0 | grad norm: 0.641 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-11-30 03:41:47] iteration      240/     500 | consumed samples:         1920 | elapsed time per iteration (ms): 519125.6 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.827129E-06 | global batch size:     8 | lm loss: 7.934645E-01 | loss scale: 1.0 | grad norm: 0.745 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (274797.42, 274798.34)
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
processed_samples 10200 unjoint_samples 10200 joint_samples 31 [198851, 1047383]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [235979, 1041682]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [363659, 1010132]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1045781, 514167]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1039497, 626871]
processed_samples 10200 unjoint_samples 10200 joint_samples 29 [1048318, 719459]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [959819, 714686]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1048096, 696454]
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [363659, 1010132]
processed_samples 10200 unjoint_samples 10200 joint_samples 31 [198851, 1047383]
processed_samples 10200 unjoint_samples 10200 joint_samples 29 [1048318, 719459]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [235979, 1041682]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1039497, 626871]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1045781, 514167]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [959819, 714686]
processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1048096, 696454]
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
 [2024-11-30 03:58:00] iteration      241/     500 | consumed samples:         1928 | elapsed time per iteration (ms): 697775.1 | throughput per GPU (TFLOP/s/GPU): 74.6 | learning rate: 2.811355E-06 | global batch size:     8 | lm loss: 8.460391E-01 | loss scale: 1.0 | grad norm: 0.582 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
 [2024-11-30 04:09:17] iteration      242/     500 | consumed samples:         1936 | elapsed time per iteration (ms): 676982.5 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 2.795570E-06 | global batch size:     8 | lm loss: 8.695595E-01 | loss scale: 1.0 | grad norm: 0.695 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [669771, 1010132]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [80313, 1011814]
processed_samples 10300 unjoint_samples 10300 joint_samples 31 [39135, 1034652]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1045781, 761153]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [460680, 1041682]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1039497, 933155]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [966289, 967539]
[h264 @ 0x56215c18da40] mmco: unref short failure
processed_samples 10300 unjoint_samples 10300 joint_samples 31 [551478, 1047383]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [669771, 1010132]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [80313, 1011814]
[h264 @ 0x56215ee100c0] mmco: unref short failure
processed_samples 10300 unjoint_samples 10300 joint_samples 31 [39135, 1034652]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1045781, 761153]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [460680, 1041682]
processed_samples 10300 unjoint_samples 10300 joint_samples 31 [551478, 1047383]
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1039497, 933155]
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 10300 unjoint_samples 10300 joint_samples 30 [966289, 967539]
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
 [2024-11-30 04:17:54] iteration      243/     500 | consumed samples:         1944 | elapsed time per iteration (ms): 517429.4 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 2.779775E-06 | global batch size:     8 | lm loss: 8.456820E-01 | loss scale: 1.0 | grad norm: 0.564 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
 [2024-11-30 04:28:27] iteration      244/     500 | consumed samples:         1952 | elapsed time per iteration (ms): 633075.7 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 2.763971E-06 | global batch size:     8 | lm loss: 8.324536E-01 | loss scale: 1.0 | grad norm: 0.590 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [435968, 1011814]
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [1045781, 975766]
[h264 @ 0x56215be9ff00] mmco: unref short failure
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [183452, 1041048]
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [844115, 1047383]
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [426840, 1034652]
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [172853, 1038532]
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [776971, 1041682]
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [977505, 1010132]
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [435968, 1011814]
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [172853, 1038532]
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [977505, 1010132]
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [844115, 1047383]
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [183452, 1041048]
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [776971, 1041682]
processed_samples 10400 unjoint_samples 10400 joint_samples 31 [426840, 1034652]
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
processed_samples 10400 unjoint_samples 10400 joint_samples 30 [1045781, 975766]
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
 [2024-11-30 04:41:34] iteration      245/     500 | consumed samples:         1960 | elapsed time per iteration (ms): 786820.4 | throughput per GPU (TFLOP/s/GPU): 66.1 | learning rate: 2.748157E-06 | global batch size:     8 | lm loss: 8.650105E-01 | loss scale: 1.0 | grad norm: 0.570 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
 [2024-11-30 04:49:37] iteration      246/     500 | consumed samples:         1968 | elapsed time per iteration (ms): 483113.6 | throughput per GPU (TFLOP/s/GPU): 107.7 | learning rate: 2.732335E-06 | global batch size:     8 | lm loss: 8.726799E-01 | loss scale: 1.0 | grad norm: 0.605 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
 [2024-11-30 04:59:17] iteration      247/     500 | consumed samples:         1976 | elapsed time per iteration (ms): 580058.3 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 2.716506E-06 | global batch size:     8 | lm loss: 8.287100E-01 | loss scale: 1.0 | grad norm: 0.680 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [1033447, 194045]
[h264 @ 0x56215bcafa40] mmco: unref short failure
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [467014, 1038532]
processed_samples 10500 unjoint_samples 10500 joint_samples 32 [156504, 1048173]
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [218036, 1034239]
processed_samples 10500 unjoint_samples 10500 joint_samples 30 [782697, 1011814]
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [461965, 1041048]
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [1033447, 194045]
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [467014, 1038532]
processed_samples 10500 unjoint_samples 10500 joint_samples 30 [1041885, 1041682]
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [712238, 1034652]
processed_samples 10500 unjoint_samples 10500 joint_samples 30 [782697, 1011814]
[h264 @ 0x55d5141fd280] mmco: unref short failure
processed_samples 10500 unjoint_samples 10500 joint_samples 32 [156504, 1048173]
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [218036, 1034239]
[h264 @ 0x5621618e98c0] mmco: unref short failure
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [461965, 1041048]
processed_samples 10500 unjoint_samples 10500 joint_samples 31 [712238, 1034652]
processed_samples 10500 unjoint_samples 10500 joint_samples 30 [1041885, 1041682]
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
 [2024-11-30 05:07:22] iteration      248/     500 | consumed samples:         1984 | elapsed time per iteration (ms): 484634.2 | throughput per GPU (TFLOP/s/GPU): 107.4 | learning rate: 2.700669E-06 | global batch size:     8 | lm loss: 8.482005E-01 | loss scale: 1.0 | grad norm: 0.622 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
 [2024-11-30 05:15:43] iteration      249/     500 | consumed samples:         1992 | elapsed time per iteration (ms): 501138.8 | throughput per GPU (TFLOP/s/GPU): 103.8 | learning rate: 2.684826E-06 | global batch size:     8 | lm loss: 8.958365E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
 [2024-11-30 05:23:37] iteration      250/     500 | consumed samples:         2000 | elapsed time per iteration (ms): 473859.9 | throughput per GPU (TFLOP/s/GPU): 109.8 | learning rate: 2.668977E-06 | global batch size:     8 | lm loss: 8.291722E-01 | loss scale: 1.0 | grad norm: 0.640 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 10600 unjoint_samples 10600 joint_samples 32 [1041514, 11719]
processed_samples 10600 unjoint_samples 10600 joint_samples 32 [1041514, 11719]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1037875, 96482]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1037875, 96482]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [306471, 1047224]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [306471, 1047224]
processed_samples 10600 unjoint_samples 10600 joint_samples 32 [383211, 1048173]
processed_samples 10600 unjoint_samples 10600 joint_samples 32 [383211, 1048173]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [447013, 1034239]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [447013, 1034239]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1033447, 583281]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1033447, 583281]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [774199, 1038532]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [698356, 1041048]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [698356, 1041048]
processed_samples 10600 unjoint_samples 10600 joint_samples 31 [774199, 1038532]
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
 [2024-11-30 05:32:01] iteration      251/     500 | consumed samples:         2008 | elapsed time per iteration (ms): 503831.0 | throughput per GPU (TFLOP/s/GPU): 103.3 | learning rate: 2.653124E-06 | global batch size:     8 | lm loss: 8.277208E-01 | loss scale: 1.0 | grad norm: 0.663 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 10700 unjoint_samples 10700 joint_samples 32 [128410, 1045523]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1037875, 472499]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [624422, 1047224]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1033447, 841174]
processed_samples 10700 unjoint_samples 10700 joint_samples 32 [1041514, 312682]
processed_samples 10700 unjoint_samples 10700 joint_samples 32 [128410, 1045523]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1043363, 1044551]
processed_samples 10700 unjoint_samples 10700 joint_samples 32 [704600, 1048173]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1037875, 472499]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [624422, 1047224]
processed_samples 10700 unjoint_samples 10700 joint_samples 32 [1041514, 312682]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1033447, 841174]
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1043363, 1044551]
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [892183, 1034239]
processed_samples 10700 unjoint_samples 10700 joint_samples 32 [704600, 1048173]
[h264 @ 0x56215ec3c040] mmco: unref short failure
processed_samples 10700 unjoint_samples 10700 joint_samples 31 [892183, 1034239]
 [2024-11-30 05:40:40] iteration      252/     500 | consumed samples:         2016 | elapsed time per iteration (ms): 519264.0 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.637266E-06 | global batch size:     8 | lm loss: 8.455402E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
 [2024-11-30 05:50:40] iteration      253/     500 | consumed samples:         2024 | elapsed time per iteration (ms): 600427.9 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 2.621404E-06 | global batch size:     8 | lm loss: 8.403062E-01 | loss scale: 1.0 | grad norm: 1.539 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
 [2024-11-30 05:59:21] iteration      254/     500 | consumed samples:         2032 | elapsed time per iteration (ms): 520958.1 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 2.605540E-06 | global batch size:     8 | lm loss: 8.314257E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1035752, 162888]
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1045921, 146846]
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [315387, 1044551]
processed_samples 10800 unjoint_samples 10800 joint_samples 31 [942124, 1047224]
processed_samples 10800 unjoint_samples 10800 joint_samples 31 [1037875, 835556]
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [449003, 1045523]
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 10800 unjoint_samples 10800 joint_samples 33 [1032065, 29516]
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1035752, 162888]
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1045921, 146846]
processed_samples 10800 unjoint_samples 10800 joint_samples 31 [1037875, 835556]
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [449003, 1045523]
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [315387, 1044551]
processed_samples 10800 unjoint_samples 10800 joint_samples 31 [942124, 1047224]
processed_samples 10800 unjoint_samples 10800 joint_samples 33 [1032065, 29516]
 [2024-11-30 06:09:13] iteration      255/     500 | consumed samples:         2040 | elapsed time per iteration (ms): 591496.3 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 2.589673E-06 | global batch size:     8 | lm loss: 8.430896E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1041514, 655987]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1041514, 655987]
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
 [2024-11-30 06:16:46] iteration      256/     500 | consumed samples:         2048 | elapsed time per iteration (ms): 453571.3 | throughput per GPU (TFLOP/s/GPU): 114.7 | learning rate: 2.573804E-06 | global batch size:     8 | lm loss: 7.856827E-01 | loss scale: 1.0 | grad norm: 0.538 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
 [2024-11-30 06:26:13] iteration      257/     500 | consumed samples:         2056 | elapsed time per iteration (ms): 566571.4 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 2.557935E-06 | global batch size:     8 | lm loss: 7.880665E-01 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
 [2024-11-30 06:34:21] iteration      258/     500 | consumed samples:         2064 | elapsed time per iteration (ms): 488091.6 | throughput per GPU (TFLOP/s/GPU): 106.6 | learning rate: 2.542065E-06 | global batch size:     8 | lm loss: 8.652607E-01 | loss scale: 1.0 | grad norm: 0.579 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [190252, 1037338]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [190252, 1037338]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1046966, 215279]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1046966, 215279]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1035752, 459328]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [722456, 1045523]
processed_samples 10900 unjoint_samples 10900 joint_samples 33 [1032065, 351246]
processed_samples 10900 unjoint_samples 10900 joint_samples 33 [1032065, 351246]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1035752, 459328]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1045921, 403937]
processed_samples 10900 unjoint_samples 10900 joint_samples 33 [57951, 1023521]
processed_samples 10900 unjoint_samples 10900 joint_samples 33 [57951, 1023521]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1045921, 403937]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [722456, 1045523]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [575214, 1044551]
processed_samples 10900 unjoint_samples 10900 joint_samples 32 [575214, 1044551]
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [489455, 1037338]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1046966, 552808]
processed_samples 11000 unjoint_samples 11000 joint_samples 33 [381536, 1023521]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1035752, 843715]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1045921, 661662]
processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1032065, 659269]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [893684, 1044551]
processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1044162, 75800]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1035752, 843715]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [489455, 1037338]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1046966, 552808]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1045921, 661662]
processed_samples 11000 unjoint_samples 11000 joint_samples 33 [381536, 1023521]
processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1032065, 659269]
processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1044162, 75800]
processed_samples 11000 unjoint_samples 11000 joint_samples 32 [893684, 1044551]
 [2024-11-30 06:44:34] iteration      259/     500 | consumed samples:         2072 | elapsed time per iteration (ms): 612328.8 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 2.526196E-06 | global batch size:     8 | lm loss: 8.281539E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
 [2024-11-30 06:53:14] iteration      260/     500 | consumed samples:         2080 | elapsed time per iteration (ms): 520294.5 | throughput per GPU (TFLOP/s/GPU): 100.0 | learning rate: 2.510327E-06 | global batch size:     8 | lm loss: 8.611338E-01 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (266645.05, 266646.24)
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
 [2024-11-30 07:08:37] iteration      261/     500 | consumed samples:         2088 | elapsed time per iteration (ms): 656543.1 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 2.494460E-06 | global batch size:     8 | lm loss: 8.357545E-01 | loss scale: 1.0 | grad norm: 0.552 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 07:18:10] iteration      262/     500 | consumed samples:         2096 | elapsed time per iteration (ms): 573062.9 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 2.478596E-06 | global batch size:     8 | lm loss: 8.310040E-01 | loss scale: 1.0 | grad norm: 0.663 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [90226, 1044222]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [90226, 1044222]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [35137, 1023359]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [35137, 1023359]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1045055, 96004]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1045055, 96004]
processed_samples 11100 unjoint_samples 11100 joint_samples 32 [1046966, 1009349]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1044162, 432440]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1044162, 432440]
processed_samples 11100 unjoint_samples 11100 joint_samples 32 [917931, 1037338]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1032065, 1004680]
processed_samples 11100 unjoint_samples 11100 joint_samples 32 [917931, 1037338]
processed_samples 11100 unjoint_samples 11100 joint_samples 32 [1046966, 1009349]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1032065, 1004680]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [626501, 1023521]
processed_samples 11100 unjoint_samples 11100 joint_samples 33 [626501, 1023521]
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
 [2024-11-30 07:27:28] iteration      263/     500 | consumed samples:         2104 | elapsed time per iteration (ms): 558294.7 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 2.462734E-06 | global batch size:     8 | lm loss: 8.269715E-01 | loss scale: 1.0 | grad norm: 0.520 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 07:35:47] iteration      264/     500 | consumed samples:         2112 | elapsed time per iteration (ms): 498352.4 | throughput per GPU (TFLOP/s/GPU): 104.4 | learning rate: 2.446876E-06 | global batch size:     8 | lm loss: 8.242128E-01 | loss scale: 1.0 | grad norm: 0.618 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
 [2024-11-30 07:46:39] iteration      265/     500 | consumed samples:         2120 | elapsed time per iteration (ms): 652257.7 | throughput per GPU (TFLOP/s/GPU): 79.8 | learning rate: 2.431023E-06 | global batch size:     8 | lm loss: 8.014107E-01 | loss scale: 1.0 | grad norm: 0.623 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [370324, 1044222]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1006570, 223978]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [272745, 1046448]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [258289, 1023359]
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1045055, 431566]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1044162, 740665]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [937926, 1023521]
processed_samples 11200 unjoint_samples 11200 joint_samples 34 [316008, 1046612]
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1006570, 223978]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1045055, 431566]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [272745, 1046448]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [370324, 1044222]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [258289, 1023359]
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1044162, 740665]
processed_samples 11200 unjoint_samples 11200 joint_samples 34 [316008, 1046612]
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 11200 unjoint_samples 11200 joint_samples 33 [937926, 1023521]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
 [2024-11-30 07:57:17] iteration      266/     500 | consumed samples:         2128 | elapsed time per iteration (ms): 638306.7 | throughput per GPU (TFLOP/s/GPU): 81.5 | learning rate: 2.415174E-06 | global batch size:     8 | lm loss: 8.306779E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1006570, 490094]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [607037, 1044222]
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [783103, 1046448]
processed_samples 11300 unjoint_samples 11300 joint_samples 34 [1034746, 237539]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1045055, 705299]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1044162, 1039867]
processed_samples 11300 unjoint_samples 11300 joint_samples 34 [671219, 1046612]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [602074, 1023359]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1006570, 490094]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [607037, 1044222]
processed_samples 11300 unjoint_samples 11300 joint_samples 34 [1034746, 237539]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [783103, 1046448]
processed_samples 11300 unjoint_samples 11300 joint_samples 34 [671219, 1046612]
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1044162, 1039867]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [602074, 1023359]
processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1045055, 705299]
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
 [2024-11-30 08:05:16] iteration      267/     500 | consumed samples:         2136 | elapsed time per iteration (ms): 478283.8 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 2.399331E-06 | global batch size:     8 | lm loss: 8.054468E-01 | loss scale: 1.0 | grad norm: 0.702 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
 [2024-11-30 08:13:14] iteration      268/     500 | consumed samples:         2144 | elapsed time per iteration (ms): 478182.6 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 2.383494E-06 | global batch size:     8 | lm loss: 7.975258E-01 | loss scale: 1.0 | grad norm: 1.340 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
 [2024-11-30 08:22:18] iteration      269/     500 | consumed samples:         2152 | elapsed time per iteration (ms): 544529.5 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 2.367665E-06 | global batch size:     8 | lm loss: 8.677651E-01 | loss scale: 1.0 | grad norm: 0.647 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [93122, 1042734]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [950986, 1046612]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1045028, 137747]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1046638, 420718]
processed_samples 11400 unjoint_samples 11400 joint_samples 33 [847191, 1023359]
processed_samples 11400 unjoint_samples 11400 joint_samples 33 [847191, 1023359]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [93122, 1042734]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1045028, 137747]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1034746, 494401]
processed_samples 11400 unjoint_samples 11400 joint_samples 33 [912762, 1044222]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [950986, 1046612]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1046638, 420718]
processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1034746, 494401]
processed_samples 11400 unjoint_samples 11400 joint_samples 33 [912762, 1044222]
processed_samples 11400 unjoint_samples 11400 joint_samples 33 [1006570, 814426]
processed_samples 11400 unjoint_samples 11400 joint_samples 33 [1006570, 814426]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
 [2024-11-30 08:30:18] iteration      270/     500 | consumed samples:         2160 | elapsed time per iteration (ms): 479974.0 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 2.351843E-06 | global batch size:     8 | lm loss: 8.280485E-01 | loss scale: 1.0 | grad norm: 0.675 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b931500] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d513c38380] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
 [2024-11-30 08:38:59] iteration      271/     500 | consumed samples:         2168 | elapsed time per iteration (ms): 520205.1 | throughput per GPU (TFLOP/s/GPU): 100.0 | learning rate: 2.336029E-06 | global batch size:     8 | lm loss: 8.528996E-01 | loss scale: 1.0 | grad norm: 0.622 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-30 08:50:53] iteration      272/     500 | consumed samples:         2176 | elapsed time per iteration (ms): 713939.2 | throughput per GPU (TFLOP/s/GPU): 72.9 | learning rate: 2.320225E-06 | global batch size:     8 | lm loss: 7.788677E-01 | loss scale: 1.0 | grad norm: 0.693 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-30 08:58:48] iteration      273/     500 | consumed samples:         2184 | elapsed time per iteration (ms): 475730.7 | throughput per GPU (TFLOP/s/GPU): 109.4 | learning rate: 2.304430E-06 | global batch size:     8 | lm loss: 8.115300E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1031307, 273919]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [143094, 1032155]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1010911, 127257]
processed_samples 11500 unjoint_samples 11500 joint_samples 35 [178384, 1046612]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1045028, 395499]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [143094, 1032155]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1034746, 745511]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1031307, 273919]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1046638, 666786]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [400243, 1042734]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1010911, 127257]
processed_samples 11500 unjoint_samples 11500 joint_samples 35 [178384, 1046612]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1045028, 395499]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [400243, 1042734]
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1046638, 666786]
[h264 @ 0x55d5141f5f40] mmco: unref short failure
processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1034746, 745511]
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
 [2024-11-30 09:08:50] iteration      274/     500 | consumed samples:         2192 | elapsed time per iteration (ms): 601265.3 | throughput per GPU (TFLOP/s/GPU): 86.6 | learning rate: 2.288645E-06 | global batch size:     8 | lm loss: 7.966631E-01 | loss scale: 1.0 | grad norm: 0.664 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [464788, 1032155]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1031307, 748776]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [464788, 1032155]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1010911, 433321]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1031307, 748776]
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1045028, 868530]
processed_samples 11600 unjoint_samples 11600 joint_samples 35 [84381, 1028799]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1046638, 920461]
processed_samples 11600 unjoint_samples 11600 joint_samples 35 [452627, 1046612]
processed_samples 11600 unjoint_samples 11600 joint_samples 35 [84381, 1028799]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1010911, 433321]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [717191, 1042734]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1045028, 868530]
processed_samples 11600 unjoint_samples 11600 joint_samples 35 [452627, 1046612]
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [717191, 1042734]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1046638, 920461]
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
 [2024-11-30 09:16:09] iteration      275/     500 | consumed samples:         2200 | elapsed time per iteration (ms): 439726.4 | throughput per GPU (TFLOP/s/GPU): 118.3 | learning rate: 2.272871E-06 | global batch size:     8 | lm loss: 8.427922E-01 | loss scale: 1.0 | grad norm: 0.629 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 09:24:45] iteration      276/     500 | consumed samples:         2208 | elapsed time per iteration (ms): 515239.6 | throughput per GPU (TFLOP/s/GPU): 101.0 | learning rate: 2.257109E-06 | global batch size:     8 | lm loss: 8.416483E-01 | loss scale: 1.0 | grad norm: 0.634 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215d5b6680] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
 [2024-11-30 09:36:07] iteration      277/     500 | consumed samples:         2216 | elapsed time per iteration (ms): 682197.4 | throughput per GPU (TFLOP/s/GPU): 76.3 | learning rate: 2.241359E-06 | global batch size:     8 | lm loss: 7.881758E-01 | loss scale: 1.0 | grad norm: 0.592 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1043707, 1043534]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [284268, 1010837]
processed_samples 11700 unjoint_samples 11700 joint_samples 34 [762107, 1032155]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1046638, 200875]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1043849, 56536]
processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1010911, 863024]
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1010911, 863024]
processed_samples 11700 unjoint_samples 11700 joint_samples 34 [762107, 1032155]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1043849, 56536]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [284268, 1010837]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [324394, 1028799]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1046638, 200875]
processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1043707, 1043534]
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [757912, 1046612]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [324394, 1028799]
processed_samples 11700 unjoint_samples 11700 joint_samples 35 [757912, 1046612]
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-11-30 09:46:41] iteration      278/     500 | consumed samples:         2224 | elapsed time per iteration (ms): 634027.2 | throughput per GPU (TFLOP/s/GPU): 82.1 | learning rate: 2.225622E-06 | global batch size:     8 | lm loss: 8.349802E-01 | loss scale: 1.0 | grad norm: 0.896 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516b0c1c0] mmco: unref short failure
[h264 @ 0x55d516b0c1c0] mmco: unref short failure
 [2024-11-30 09:55:01] iteration      279/     500 | consumed samples:         2232 | elapsed time per iteration (ms): 500105.8 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 2.209899E-06 | global batch size:     8 | lm loss: 7.836739E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 10:05:10] iteration      280/     500 | consumed samples:         2240 | elapsed time per iteration (ms): 609166.8 | throughput per GPU (TFLOP/s/GPU): 85.4 | learning rate: 2.194190E-06 | global batch size:     8 | lm loss: 8.044785E-01 | loss scale: 1.0 | grad norm: 1.376 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (171105.91, 171106.25)
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215c030640] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
 [2024-11-30 10:16:11] iteration      281/     500 | consumed samples:         2248 | elapsed time per iteration (ms): 489721.5 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 2.178496E-06 | global batch size:     8 | lm loss: 8.157408E-01 | loss scale: 1.0 | grad norm: 0.658 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [52957, 1044877]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043849, 366158]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [581835, 1028799]
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1047582, 226100]
processed_samples 11800 unjoint_samples 11800 joint_samples 36 [1035315, 165408]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043425, 136197]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [650628, 1010837]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1046638, 508637]
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [52957, 1044877]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043849, 366158]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1046638, 508637]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [581835, 1028799]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 11800 unjoint_samples 11800 joint_samples 36 [1035315, 165408]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1047582, 226100]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043425, 136197]
processed_samples 11800 unjoint_samples 11800 joint_samples 35 [650628, 1010837]
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [353851, 1044877]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043849, 681942]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [859390, 1028799]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1047582, 516773]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [960099, 1010837]
processed_samples 11900 unjoint_samples 11900 joint_samples 36 [1035315, 604488]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043425, 503852]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1046638, 860170]
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [353851, 1044877]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043425, 503852]
processed_samples 11900 unjoint_samples 11900 joint_samples 36 [1035315, 604488]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1047582, 516773]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043849, 681942]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [859390, 1028799]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [960099, 1010837]
processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1046638, 860170]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
 [2024-11-30 10:27:34] iteration      282/     500 | consumed samples:         2256 | elapsed time per iteration (ms): 683305.7 | throughput per GPU (TFLOP/s/GPU): 76.2 | learning rate: 2.162817E-06 | global batch size:     8 | lm loss: 7.566515E-01 | loss scale: 1.0 | grad norm: 0.545 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x55d513632740] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
 [2024-11-30 10:35:49] iteration      283/     500 | consumed samples:         2264 | elapsed time per iteration (ms): 495043.0 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 2.147155E-06 | global batch size:     8 | lm loss: 8.410187E-01 | loss scale: 1.0 | grad norm: 0.790 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
 [2024-11-30 10:46:45] iteration      284/     500 | consumed samples:         2272 | elapsed time per iteration (ms): 655690.9 | throughput per GPU (TFLOP/s/GPU): 79.4 | learning rate: 2.131510E-06 | global batch size:     8 | lm loss: 7.616670E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [242872, 1046135]
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1003444, 360813]
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [236351, 1037505]
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043849, 983242]
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1047582, 865435]
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1035315, 978031]
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043425, 801378]
[h264 @ 0x55d51713dd00] mmco: unref short failure
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [796309, 1044877]
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1003444, 360813]
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043425, 801378]
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [236351, 1037505]
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [242872, 1046135]
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043849, 983242]
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [796309, 1044877]
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1047582, 865435]
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1035315, 978031]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
 [2024-11-30 10:57:08] iteration      285/     500 | consumed samples:         2280 | elapsed time per iteration (ms): 623028.4 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 2.115882E-06 | global batch size:     8 | lm loss: 8.154826E-01 | loss scale: 1.0 | grad norm: 0.667 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
 [2024-11-30 11:06:58] iteration      286/     500 | consumed samples:         2288 | elapsed time per iteration (ms): 589484.3 | throughput per GPU (TFLOP/s/GPU): 88.3 | learning rate: 2.100273E-06 | global batch size:     8 | lm loss: 8.646793E-01 | loss scale: 1.0 | grad norm: 0.998 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215ef24280] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
 [2024-11-30 11:15:14] iteration      287/     500 | consumed samples:         2296 | elapsed time per iteration (ms): 496441.8 | throughput per GPU (TFLOP/s/GPU): 104.8 | learning rate: 2.084682E-06 | global batch size:     8 | lm loss: 7.899673E-01 | loss scale: 1.0 | grad norm: 0.709 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 11:23:49] iteration      288/     500 | consumed samples:         2304 | elapsed time per iteration (ms): 515215.7 | throughput per GPU (TFLOP/s/GPU): 101.0 | learning rate: 2.069111E-06 | global batch size:     8 | lm loss: 8.179791E-01 | loss scale: 1.0 | grad norm: 0.600 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 11:33:46] iteration      289/     500 | consumed samples:         2312 | elapsed time per iteration (ms): 597082.0 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 2.053560E-06 | global batch size:     8 | lm loss: 8.552551E-01 | loss scale: 1.0 | grad norm: 0.682 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [993547, 135709]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [145136, 1046501]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1048014, 119398]
processed_samples 12100 unjoint_samples 12100 joint_samples 37 [196550, 1047132]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1043849, 351598]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [532513, 1037505]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [613456, 1046135]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1003444, 680620]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [145136, 1046501]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [993547, 135709]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1048014, 119398]
processed_samples 12100 unjoint_samples 12100 joint_samples 37 [196550, 1047132]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1043849, 351598]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [532513, 1037505]
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1003444, 680620]
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
processed_samples 12100 unjoint_samples 12100 joint_samples 36 [613456, 1046135]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x55d516ad1940] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [993547, 473142]
processed_samples 12200 unjoint_samples 12200 joint_samples 37 [504885, 1047132]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1048014, 333278]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [804067, 1037505]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [433142, 1046501]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1003444, 914513]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1043849, 693272]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [931466, 1046135]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 12200 unjoint_samples 12200 joint_samples 37 [504885, 1047132]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [993547, 473142]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1043849, 693272]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [433142, 1046501]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1048014, 333278]
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1003444, 914513]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [804067, 1037505]
processed_samples 12200 unjoint_samples 12200 joint_samples 36 [931466, 1046135]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
 [2024-11-30 11:46:13] iteration      290/     500 | consumed samples:         2320 | elapsed time per iteration (ms): 746951.5 | throughput per GPU (TFLOP/s/GPU): 69.7 | learning rate: 2.038029E-06 | global batch size:     8 | lm loss: 8.304491E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 11:54:12] iteration      291/     500 | consumed samples:         2328 | elapsed time per iteration (ms): 478261.6 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 2.022521E-06 | global batch size:     8 | lm loss: 7.944888E-01 | loss scale: 1.0 | grad norm: 0.555 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 12:03:11] iteration      292/     500 | consumed samples:         2336 | elapsed time per iteration (ms): 539350.2 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 2.007034E-06 | global batch size:     8 | lm loss: 7.973379E-01 | loss scale: 1.0 | grad norm: 1.126 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1043849, 990824]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [251464, 1046430]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [688565, 1046501]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1048014, 615495]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046797, 218884]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [993547, 791688]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [251464, 1046430]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1043849, 990824]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046831, 299400]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [993547, 791688]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [688565, 1046501]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046797, 218884]
processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1048014, 615495]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046831, 299400]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [983928, 1047132]
processed_samples 12300 unjoint_samples 12300 joint_samples 37 [983928, 1047132]
 [2024-11-30 12:12:44] iteration      293/     500 | consumed samples:         2344 | elapsed time per iteration (ms): 573404.3 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 1.991570E-06 | global batch size:     8 | lm loss: 8.226759E-01 | loss scale: 1.0 | grad norm: 0.719 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 12:23:57] iteration      294/     500 | consumed samples:         2352 | elapsed time per iteration (ms): 672631.5 | throughput per GPU (TFLOP/s/GPU): 77.4 | learning rate: 1.976130E-06 | global batch size:     8 | lm loss: 8.111385E-01 | loss scale: 1.0 | grad norm: 0.669 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
 [2024-11-30 12:33:13] iteration      295/     500 | consumed samples:         2360 | elapsed time per iteration (ms): 555682.2 | throughput per GPU (TFLOP/s/GPU): 93.7 | learning rate: 1.960714E-06 | global batch size:     8 | lm loss: 8.239835E-01 | loss scale: 1.0 | grad norm: 0.590 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [354583, 1031883]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [106069, 1043662]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [565622, 1046430]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046797, 502036]
processed_samples 12400 unjoint_samples 12400 joint_samples 38 [1008915, 359357]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046831, 588798]
processed_samples 12400 unjoint_samples 12400 joint_samples 36 [1026683, 1026981]
processed_samples 12400 unjoint_samples 12400 joint_samples 36 [939259, 1046501]
[h264 @ 0x55d517608ac0] mmco: unref short failure
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [354583, 1031883]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [106069, 1043662]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [565622, 1046430]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046797, 502036]
processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046831, 588798]
processed_samples 12400 unjoint_samples 12400 joint_samples 38 [1008915, 359357]
processed_samples 12400 unjoint_samples 12400 joint_samples 36 [1026683, 1026981]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
processed_samples 12400 unjoint_samples 12400 joint_samples 36 [939259, 1046501]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
 [2024-11-30 12:41:33] iteration      296/     500 | consumed samples:         2368 | elapsed time per iteration (ms): 499937.2 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 1.945322E-06 | global batch size:     8 | lm loss: 8.253292E-01 | loss scale: 1.0 | grad norm: 0.627 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
 [2024-11-30 12:54:29] iteration      297/     500 | consumed samples:         2376 | elapsed time per iteration (ms): 776033.1 | throughput per GPU (TFLOP/s/GPU): 67.1 | learning rate: 1.929956E-06 | global batch size:     8 | lm loss: 8.470016E-01 | loss scale: 1.0 | grad norm: 0.718 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1040955, 213455]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [392874, 1043662]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [706780, 1031883]
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046526, 269576]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [943416, 1046430]
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046797, 834976]
processed_samples 12500 unjoint_samples 12500 joint_samples 38 [1008915, 805986]
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
processed_samples 12500 unjoint_samples 12500 joint_samples 38 [24850, 1043320]
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046526, 269576]
processed_samples 12500 unjoint_samples 12500 joint_samples 38 [1008915, 805986]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1040955, 213455]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [706780, 1031883]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [392874, 1043662]
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046797, 834976]
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
processed_samples 12500 unjoint_samples 12500 joint_samples 37 [943416, 1046430]
processed_samples 12500 unjoint_samples 12500 joint_samples 38 [24850, 1043320]
 [2024-11-30 13:02:29] iteration      298/     500 | consumed samples:         2384 | elapsed time per iteration (ms): 480085.5 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 1.914616E-06 | global batch size:     8 | lm loss: 7.771384E-01 | loss scale: 1.0 | grad norm: 0.622 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
 [2024-11-30 13:12:05] iteration      299/     500 | consumed samples:         2392 | elapsed time per iteration (ms): 575897.8 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 1.899303E-06 | global batch size:     8 | lm loss: 8.349781E-01 | loss scale: 1.0 | grad norm: 0.591 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x56215c3e4200] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
 [2024-11-30 13:19:46] iteration      300/     500 | consumed samples:         2400 | elapsed time per iteration (ms): 461560.0 | throughput per GPU (TFLOP/s/GPU): 112.7 | learning rate: 1.884016E-06 | global batch size:     8 | lm loss: 8.001071E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (282566.86, 282567.81)
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-11-30 13:33:37] iteration      301/     500 | consumed samples:         2408 | elapsed time per iteration (ms): 547954.3 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 1.868758E-06 | global batch size:     8 | lm loss: 8.713289E-01 | loss scale: 1.0 | grad norm: 0.673 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 12600 unjoint_samples 12600 joint_samples 39 [17506, 1046889]
processed_samples 12600 unjoint_samples 12600 joint_samples 39 [17506, 1046889]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1028954, 1031883]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1028954, 1031883]
processed_samples 12600 unjoint_samples 12600 joint_samples 38 [141516, 1046478]
processed_samples 12600 unjoint_samples 12600 joint_samples 38 [141516, 1046478]
processed_samples 12600 unjoint_samples 12600 joint_samples 38 [1035194, 366712]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1040955, 464830]
processed_samples 12600 unjoint_samples 12600 joint_samples 38 [1035194, 366712]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1040955, 464830]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1046526, 539818]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1046526, 539818]
processed_samples 12600 unjoint_samples 12600 joint_samples 38 [280121, 1043320]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [641125, 1043662]
processed_samples 12600 unjoint_samples 12600 joint_samples 38 [280121, 1043320]
processed_samples 12600 unjoint_samples 12600 joint_samples 37 [641125, 1043662]
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-11-30 13:46:17] iteration      302/     500 | consumed samples:         2416 | elapsed time per iteration (ms): 760100.8 | throughput per GPU (TFLOP/s/GPU): 68.5 | learning rate: 1.853528E-06 | global batch size:     8 | lm loss: 8.170073E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1045818, 284715]
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1035194, 660677]
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
processed_samples 12700 unjoint_samples 12700 joint_samples 39 [293695, 1046889]
processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1046526, 893046]
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [494410, 1043320]
processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1040955, 855781]
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [562324, 1046478]
processed_samples 12700 unjoint_samples 12700 joint_samples 37 [949756, 1043662]
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1045818, 284715]
[h264 @ 0x55d5145449c0] mmco: unref short failure
processed_samples 12700 unjoint_samples 12700 joint_samples 39 [293695, 1046889]
processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1046526, 893046]
processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1040955, 855781]
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1035194, 660677]
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [494410, 1043320]
processed_samples 12700 unjoint_samples 12700 joint_samples 38 [562324, 1046478]
[h264 @ 0x55d517c83800] mmco: unref short failure
processed_samples 12700 unjoint_samples 12700 joint_samples 37 [949756, 1043662]
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
 [2024-11-30 13:56:06] iteration      303/     500 | consumed samples:         2424 | elapsed time per iteration (ms): 588726.4 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 1.838328E-06 | global batch size:     8 | lm loss: 8.639438E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
 [2024-11-30 14:05:05] iteration      304/     500 | consumed samples:         2432 | elapsed time per iteration (ms): 538865.4 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 1.823157E-06 | global batch size:     8 | lm loss: 8.561846E-01 | loss scale: 1.0 | grad norm: 0.625 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
 [2024-11-30 14:13:55] iteration      305/     500 | consumed samples:         2440 | elapsed time per iteration (ms): 530809.8 | throughput per GPU (TFLOP/s/GPU): 98.0 | learning rate: 1.808017E-06 | global batch size:     8 | lm loss: 7.760535E-01 | loss scale: 1.0 | grad norm: 0.571 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1046526, 139839]
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1045818, 554980]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [857097, 1046478]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [219037, 1045808]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1043268, 112077]
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 39 [560760, 1046889]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1035194, 898744]
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [778423, 1043320]
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1046526, 139839]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [857097, 1046478]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1045818, 554980]
[h264 @ 0x56215bb50880] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [219037, 1045808]
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1043268, 112077]
processed_samples 12800 unjoint_samples 12800 joint_samples 39 [560760, 1046889]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [778423, 1043320]
processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1035194, 898744]
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x55d517c83800] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
 [2024-11-30 14:23:15] iteration      306/     500 | consumed samples:         2448 | elapsed time per iteration (ms): 559066.7 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 1.792908E-06 | global batch size:     8 | lm loss: 7.674619E-01 | loss scale: 1.0 | grad norm: 0.564 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-11-30 14:31:20] iteration      307/     500 | consumed samples:         2456 | elapsed time per iteration (ms): 485233.6 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 1.777831E-06 | global batch size:     8 | lm loss: 8.237488E-01 | loss scale: 1.0 | grad norm: 0.547 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046397, 85695]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1046526, 528940]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [467957, 1045808]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1043268, 511206]
processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046275, 62598]
processed_samples 12900 unjoint_samples 12900 joint_samples 39 [838773, 1046889]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1045818, 1002455]
processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046397, 85695]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [467957, 1045808]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1046526, 528940]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1043268, 511206]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1021311, 1043320]
processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046275, 62598]
processed_samples 12900 unjoint_samples 12900 joint_samples 39 [838773, 1046889]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1045818, 1002455]
processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1021311, 1043320]
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
 [2024-11-30 14:40:57] iteration      308/     500 | consumed samples:         2464 | elapsed time per iteration (ms): 577346.8 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 1.762786E-06 | global batch size:     8 | lm loss: 7.823042E-01 | loss scale: 1.0 | grad norm: 0.562 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x55d513738040] mmco: unref short failure
 [2024-11-30 14:50:22] iteration      309/     500 | consumed samples:         2472 | elapsed time per iteration (ms): 564389.6 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 1.747775E-06 | global batch size:     8 | lm loss: 8.073215E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513738040] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-11-30 15:01:02] iteration      310/     500 | consumed samples:         2480 | elapsed time per iteration (ms): 640592.5 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 1.732797E-06 | global batch size:     8 | lm loss: 7.878639E-01 | loss scale: 1.0 | grad norm: 0.630 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x55d512f0e280] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d2fbb40] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046275, 369384]
processed_samples 13000 unjoint_samples 13000 joint_samples 38 [956646, 1045808]
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [240077, 1047349]
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046397, 334750]
processed_samples 13000 unjoint_samples 13000 joint_samples 40 [1045623, 91370]
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [223695, 1044019]
processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1046526, 814297]
processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1043268, 872584]
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046275, 369384]
processed_samples 13000 unjoint_samples 13000 joint_samples 40 [1045623, 91370]
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [240077, 1047349]
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046397, 334750]
processed_samples 13000 unjoint_samples 13000 joint_samples 39 [223695, 1044019]
processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1046526, 814297]
processed_samples 13000 unjoint_samples 13000 joint_samples 38 [956646, 1045808]
[h264 @ 0x55d516907ac0] mmco: unref short failure
processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1043268, 872584]
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215d2fbb40] mmco: unref short failure
[h264 @ 0x56215d2fbb40] mmco: unref short failure
[h264 @ 0x56215c1c3180] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
 [2024-11-30 15:10:17] iteration      311/     500 | consumed samples:         2488 | elapsed time per iteration (ms): 554512.4 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 1.717853E-06 | global batch size:     8 | lm loss: 7.991486E-01 | loss scale: 1.0 | grad norm: 0.605 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
 [2024-11-30 15:19:24] iteration      312/     500 | consumed samples:         2496 | elapsed time per iteration (ms): 546881.2 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 1.702944E-06 | global batch size:     8 | lm loss: 8.143560E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
 [2024-11-30 15:28:58] iteration      313/     500 | consumed samples:         2504 | elapsed time per iteration (ms): 574661.5 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 1.688070E-06 | global batch size:     8 | lm loss: 8.001103E-01 | loss scale: 1.0 | grad norm: 0.603 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [103466, 1024651]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046275, 709019]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [93796, 1029821]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [571757, 1047349]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [103466, 1024651]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [609128, 1044019]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046397, 678441]
processed_samples 13100 unjoint_samples 13100 joint_samples 40 [1045623, 394135]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046417, 212551]
processed_samples 13100 unjoint_samples 13100 joint_samples 40 [1045623, 394135]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [93796, 1029821]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [571757, 1047349]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046397, 678441]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046275, 709019]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [609128, 1044019]
processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046417, 212551]
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56216223ed40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d514dfd500] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c1c3180] mmco: unref short failure
[h264 @ 0x56215c1c3180] mmco: unref short failure
[h264 @ 0x56215c1c3180] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [410289, 1024651]
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [453053, 1029821]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [909462, 1044019]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [825074, 1047349]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046417, 469797]
processed_samples 13200 unjoint_samples 13200 joint_samples 40 [1045623, 665843]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046275, 1020331]
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046397, 931380]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [410289, 1024651]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046397, 931380]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [909462, 1044019]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [453053, 1029821]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046417, 469797]
processed_samples 13200 unjoint_samples 13200 joint_samples 40 [1045623, 665843]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [825074, 1047349]
processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046275, 1020331]
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-11-30 15:40:10] iteration      314/     500 | consumed samples:         2512 | elapsed time per iteration (ms): 671337.2 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 1.673233E-06 | global batch size:     8 | lm loss: 8.582259E-01 | loss scale: 1.0 | grad norm: 0.582 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
 [2024-11-30 15:49:06] iteration      315/     500 | consumed samples:         2520 | elapsed time per iteration (ms): 536085.1 | throughput per GPU (TFLOP/s/GPU): 97.1 | learning rate: 1.658433E-06 | global batch size:     8 | lm loss: 8.089479E-01 | loss scale: 1.0 | grad norm: 0.701 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
 [2024-11-30 15:58:40] iteration      316/     500 | consumed samples:         2528 | elapsed time per iteration (ms): 574309.7 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 1.643670E-06 | global batch size:     8 | lm loss: 8.455462E-01 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
 [2024-11-30 16:08:19] iteration      317/     500 | consumed samples:         2536 | elapsed time per iteration (ms): 578986.6 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 1.628945E-06 | global batch size:     8 | lm loss: 8.240018E-01 | loss scale: 1.0 | grad norm: 0.727 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
 [2024-11-30 16:23:11] iteration      318/     500 | consumed samples:         2544 | elapsed time per iteration (ms): 891890.1 | throughput per GPU (TFLOP/s/GPU): 58.3 | learning rate: 1.614259E-06 | global batch size:     8 | lm loss: 8.104389E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1028881, 517520]
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046397, 230256]
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [63461, 1047349]
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046275, 280185]
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
processed_samples 13300 unjoint_samples 13300 joint_samples 41 [399929, 835418]
processed_samples 13300 unjoint_samples 13300 joint_samples 39 [1046417, 860811]
processed_samples 13300 unjoint_samples 13300 joint_samples 39 [744636, 1029821]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
processed_samples 13300 unjoint_samples 13300 joint_samples 39 [794114, 1024651]
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046397, 230256]
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [63461, 1047349]
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046275, 280185]
processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1028881, 517520]
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
processed_samples 13300 unjoint_samples 13300 joint_samples 41 [399929, 835418]
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 13300 unjoint_samples 13300 joint_samples 39 [1046417, 860811]
processed_samples 13300 unjoint_samples 13300 joint_samples 39 [744636, 1029821]
processed_samples 13300 unjoint_samples 13300 joint_samples 39 [794114, 1024651]
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
 [2024-11-30 16:32:13] iteration      319/     500 | consumed samples:         2552 | elapsed time per iteration (ms): 542489.5 | throughput per GPU (TFLOP/s/GPU): 95.9 | learning rate: 1.599612E-06 | global batch size:     8 | lm loss: 8.013833E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d516fe4400] Missing reference picture, default is 65530
[h264 @ 0x55d516fe4400] Missing reference picture, default is 65530
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b35e3c0] Missing reference picture, default is 65530
[h264 @ 0x56215b35e3c0] Missing reference picture, default is 65530
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [275376, 1047349]
[h264 @ 0x55d517608ac0] mmco: unref short failure
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [96442, 1045599]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1028881, 803492]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046275, 632442]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1044023, 7939]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046397, 549541]
processed_samples 13400 unjoint_samples 13400 joint_samples 41 [673681, 835418]
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046469, 2275]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [275376, 1047349]
[h264 @ 0x56215b706800] mmco: unref short failure
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [96442, 1045599]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046275, 632442]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1044023, 7939]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046469, 2275]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1028881, 803492]
processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046397, 549541]
processed_samples 13400 unjoint_samples 13400 joint_samples 41 [673681, 835418]
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
 [2024-11-30 16:42:59] iteration      320/     500 | consumed samples:         2560 | elapsed time per iteration (ms): 645208.1 | throughput per GPU (TFLOP/s/GPU): 80.7 | learning rate: 1.585004E-06 | global batch size:     8 | lm loss: 8.406156E-01 | loss scale: 1.0 | grad norm: 0.720 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (157375.32, 157375.75)
 [2024-11-30 16:55:10] iteration      321/     500 | consumed samples:         2568 | elapsed time per iteration (ms): 573666.6 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 1.570438E-06 | global batch size:     8 | lm loss: 8.569485E-01 | loss scale: 1.0 | grad norm: 0.749 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [789281, 1047349]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046469, 307868]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046469, 307868]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046275, 850373]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1044023, 290292]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [789281, 1047349]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1044023, 290292]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [344892, 1045599]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046275, 850373]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [344892, 1045599]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1031113, 1032192]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046397, 1039659]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1031113, 1032192]
processed_samples 13500 unjoint_samples 13500 joint_samples 41 [886594, 888903]
processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046397, 1039659]
processed_samples 13500 unjoint_samples 13500 joint_samples 41 [886594, 888903]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c97ef40] mmco: unref short failure
[h264 @ 0x56215c97ef40] mmco: unref short failure
 [2024-11-30 17:04:49] iteration      322/     500 | consumed samples:         2576 | elapsed time per iteration (ms): 579528.4 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 1.555912E-06 | global batch size:     8 | lm loss: 8.402985E-01 | loss scale: 1.0 | grad norm: 0.602 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
 [2024-11-30 17:13:00] iteration      323/     500 | consumed samples:         2584 | elapsed time per iteration (ms): 491025.6 | throughput per GPU (TFLOP/s/GPU): 106.0 | learning rate: 1.541428E-06 | global batch size:     8 | lm loss: 8.409790E-01 | loss scale: 1.0 | grad norm: 0.678 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
 [2024-11-30 17:24:00] iteration      324/     500 | consumed samples:         2592 | elapsed time per iteration (ms): 659534.5 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 1.526987E-06 | global batch size:     8 | lm loss: 8.275973E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
 [2024-11-30 17:33:31] iteration      325/     500 | consumed samples:         2600 | elapsed time per iteration (ms): 571165.1 | throughput per GPU (TFLOP/s/GPU): 91.1 | learning rate: 1.512588E-06 | global batch size:     8 | lm loss: 8.160167E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-11-30 17:45:33] iteration      326/     500 | consumed samples:         2608 | elapsed time per iteration (ms): 722290.8 | throughput per GPU (TFLOP/s/GPU): 72.0 | learning rate: 1.498233E-06 | global batch size:     8 | lm loss: 8.057153E-01 | loss scale: 1.0 | grad norm: 0.536 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 13600 unjoint_samples 13600 joint_samples 40 [625978, 1045599]
processed_samples 13600 unjoint_samples 13600 joint_samples 40 [625978, 1045599]
processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1046469, 616229]
processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1046469, 616229]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [196349, 1038113]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [196349, 1038113]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1045341, 106633]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1045341, 106633]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [199051, 1045463]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [199051, 1045463]
processed_samples 13600 unjoint_samples 13600 joint_samples 42 [1046810, 9387]
processed_samples 13600 unjoint_samples 13600 joint_samples 42 [1046810, 9387]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1046620, 355512]
processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1044023, 603098]
processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1046620, 355512]
processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1044023, 603098]
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56216223ed40] mmco: unref short failure
[h264 @ 0x56216223ed40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1046620, 898755]
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1045341, 466511]
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [435373, 1038113]
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
processed_samples 13700 unjoint_samples 13700 joint_samples 42 [1046810, 283747]
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [486705, 1045463]
processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1046469, 851558]
processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1044023, 870758]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
processed_samples 13700 unjoint_samples 13700 joint_samples 40 [910623, 1045599]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1046620, 898755]
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1045341, 466511]
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [435373, 1038113]
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1046469, 851558]
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
processed_samples 13700 unjoint_samples 13700 joint_samples 42 [1046810, 283747]
processed_samples 13700 unjoint_samples 13700 joint_samples 41 [486705, 1045463]
processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1044023, 870758]
processed_samples 13700 unjoint_samples 13700 joint_samples 40 [910623, 1045599]
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-11-30 17:57:19] iteration      327/     500 | consumed samples:         2616 | elapsed time per iteration (ms): 705428.7 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 1.483922E-06 | global batch size:     8 | lm loss: 8.687828E-01 | loss scale: 1.0 | grad norm: 0.626 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
 [2024-11-30 18:04:37] iteration      328/     500 | consumed samples:         2624 | elapsed time per iteration (ms): 438551.2 | throughput per GPU (TFLOP/s/GPU): 118.7 | learning rate: 1.469656E-06 | global batch size:     8 | lm loss: 7.690409E-01 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 18:14:37] iteration      329/     500 | consumed samples:         2632 | elapsed time per iteration (ms): 599202.9 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 1.455435E-06 | global batch size:     8 | lm loss: 8.852109E-01 | loss scale: 1.0 | grad norm: 0.685 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
 [2024-11-30 18:23:39] iteration      330/     500 | consumed samples:         2640 | elapsed time per iteration (ms): 541993.7 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 1.441260E-06 | global batch size:     8 | lm loss: 7.726980E-01 | loss scale: 1.0 | grad norm: 0.566 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 13800 unjoint_samples 13800 joint_samples 42 [184223, 1045284]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [124485, 1046118]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1046955, 122399]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [149934, 1047710]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1045341, 735665]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [702685, 1038113]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [764513, 1045463]
processed_samples 13800 unjoint_samples 13800 joint_samples 42 [1046810, 638210]
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
processed_samples 13800 unjoint_samples 13800 joint_samples 42 [184223, 1045284]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [124485, 1046118]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1046955, 122399]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [149934, 1047710]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1045341, 735665]
processed_samples 13800 unjoint_samples 13800 joint_samples 42 [1046810, 638210]
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [702685, 1038113]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 13800 unjoint_samples 13800 joint_samples 41 [764513, 1045463]
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
 [2024-11-30 18:34:42] iteration      331/     500 | consumed samples:         2648 | elapsed time per iteration (ms): 663480.9 | throughput per GPU (TFLOP/s/GPU): 78.4 | learning rate: 1.427131E-06 | global batch size:     8 | lm loss: 7.973595E-01 | loss scale: 1.0 | grad norm: 0.591 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 13900 unjoint_samples 13900 joint_samples 42 [58896, 1047059]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1046955, 473432]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1046955, 473432]
processed_samples 13900 unjoint_samples 13900 joint_samples 42 [58896, 1047059]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [332452, 1046118]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [409555, 1047710]
processed_samples 13900 unjoint_samples 13900 joint_samples 42 [1046810, 900611]
processed_samples 13900 unjoint_samples 13900 joint_samples 42 [619416, 1045284]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1042745, 1045463]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [963526, 1038113]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [332452, 1046118]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [409555, 1047710]
processed_samples 13900 unjoint_samples 13900 joint_samples 42 [619416, 1045284]
processed_samples 13900 unjoint_samples 13900 joint_samples 42 [1046810, 900611]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [963526, 1038113]
processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1042745, 1045463]
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
 [2024-11-30 18:44:43] iteration      332/     500 | consumed samples:         2656 | elapsed time per iteration (ms): 600467.4 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 1.413050E-06 | global batch size:     8 | lm loss: 8.575032E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-11-30 18:53:03] iteration      333/     500 | consumed samples:         2664 | elapsed time per iteration (ms): 500926.0 | throughput per GPU (TFLOP/s/GPU): 103.9 | learning rate: 1.399016E-06 | global batch size:     8 | lm loss: 8.795822E-01 | loss scale: 1.0 | grad norm: 0.679 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-30 19:03:49] iteration      334/     500 | consumed samples:         2672 | elapsed time per iteration (ms): 645587.3 | throughput per GPU (TFLOP/s/GPU): 80.6 | learning rate: 1.385031E-06 | global batch size:     8 | lm loss: 7.920018E-01 | loss scale: 1.0 | grad norm: 0.643 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [1005724, 285725]
[h264 @ 0x55d5141f5f40] mmco: unref short failure
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [423876, 1047059]
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 14000 unjoint_samples 14000 joint_samples 41 [1046955, 890045]
processed_samples 14000 unjoint_samples 14000 joint_samples 43 [153836, 1025384]
processed_samples 14000 unjoint_samples 14000 joint_samples 41 [779490, 1047710]
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [248842, 1045463]
processed_samples 14000 unjoint_samples 14000 joint_samples 41 [592321, 1046118]
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [1005724, 285725]
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [924084, 1045284]
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [423876, 1047059]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 14000 unjoint_samples 14000 joint_samples 41 [1046955, 890045]
processed_samples 14000 unjoint_samples 14000 joint_samples 41 [779490, 1047710]
processed_samples 14000 unjoint_samples 14000 joint_samples 43 [153836, 1025384]
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [248842, 1045463]
processed_samples 14000 unjoint_samples 14000 joint_samples 41 [592321, 1046118]
processed_samples 14000 unjoint_samples 14000 joint_samples 42 [924084, 1045284]
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d512f1c480] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
 [2024-11-30 19:12:18] iteration      335/     500 | consumed samples:         2680 | elapsed time per iteration (ms): 508795.7 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.371094E-06 | global batch size:     8 | lm loss: 8.916619E-01 | loss scale: 1.0 | grad norm: 0.626 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-11-30 19:20:23] iteration      336/     500 | consumed samples:         2688 | elapsed time per iteration (ms): 484938.8 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 1.357207E-06 | global batch size:     8 | lm loss: 8.181082E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
 [2024-11-30 19:30:54] iteration      337/     500 | consumed samples:         2696 | elapsed time per iteration (ms): 631047.4 | throughput per GPU (TFLOP/s/GPU): 82.5 | learning rate: 1.343370E-06 | global batch size:     8 | lm loss: 7.870709E-01 | loss scale: 1.0 | grad norm: 0.616 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5145cc380] mmco: unref short failure
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1046955, 141152]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1046955, 141152]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [36720, 1047710]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [36720, 1047710]
processed_samples 14100 unjoint_samples 14100 joint_samples 43 [183984, 1045284]
processed_samples 14100 unjoint_samples 14100 joint_samples 43 [183984, 1045284]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1005724, 662471]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1005724, 662471]
[h264 @ 0x55d5139fff40] mmco: unref short failure
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [882020, 1047059]
[h264 @ 0x56215b89c080] mmco: unref short failure
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [882020, 1047059]
processed_samples 14100 unjoint_samples 14100 joint_samples 41 [920704, 1046118]
processed_samples 14100 unjoint_samples 14100 joint_samples 43 [520287, 1025384]
processed_samples 14100 unjoint_samples 14100 joint_samples 43 [520287, 1025384]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [552508, 1045463]
processed_samples 14100 unjoint_samples 14100 joint_samples 42 [552508, 1045463]
processed_samples 14100 unjoint_samples 14100 joint_samples 41 [920704, 1046118]
[h264 @ 0x55d516fefc40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
 [2024-11-30 19:40:45] iteration      338/     500 | consumed samples:         2704 | elapsed time per iteration (ms): 591547.6 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 1.329584E-06 | global batch size:     8 | lm loss: 7.846169E-01 | loss scale: 1.0 | grad norm: 0.579 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-11-30 19:49:58] iteration      339/     500 | consumed samples:         2712 | elapsed time per iteration (ms): 552124.6 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 1.315849E-06 | global batch size:     8 | lm loss: 7.516593E-01 | loss scale: 1.0 | grad norm: 0.736 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [159121, 1046118]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [159121, 1046118]
processed_samples 14200 unjoint_samples 14200 joint_samples 43 [131062, 1047059]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [434399, 1047710]
processed_samples 14200 unjoint_samples 14200 joint_samples 43 [131062, 1047059]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [434399, 1047710]
processed_samples 14200 unjoint_samples 14200 joint_samples 43 [453173, 1045284]
processed_samples 14200 unjoint_samples 14200 joint_samples 43 [453173, 1045284]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1046955, 515088]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1046955, 515088]
processed_samples 14200 unjoint_samples 14200 joint_samples 43 [919180, 1025384]
processed_samples 14200 unjoint_samples 14200 joint_samples 43 [919180, 1025384]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1005724, 943910]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1005724, 943910]
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [892352, 1045463]
processed_samples 14200 unjoint_samples 14200 joint_samples 42 [892352, 1045463]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215ba43980] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
 [2024-11-30 20:00:32] iteration      340/     500 | consumed samples:         2720 | elapsed time per iteration (ms): 634723.9 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 1.302166E-06 | global batch size:     8 | lm loss: 7.638261E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (263589.57, 263590.48)
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
 [2024-11-30 20:12:11] iteration      341/     500 | consumed samples:         2728 | elapsed time per iteration (ms): 435338.3 | throughput per GPU (TFLOP/s/GPU): 119.5 | learning rate: 1.288535E-06 | global batch size:     8 | lm loss: 8.274179E-01 | loss scale: 1.0 | grad norm: 0.595 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
 [2024-11-30 20:22:15] iteration      342/     500 | consumed samples:         2736 | elapsed time per iteration (ms): 603582.7 | throughput per GPU (TFLOP/s/GPU): 86.2 | learning rate: 1.274957E-06 | global batch size:     8 | lm loss: 8.112563E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
processed_samples 14300 unjoint_samples 14300 joint_samples 44 [202728, 1042201]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [76695, 1047559]
processed_samples 14300 unjoint_samples 14300 joint_samples 44 [202728, 1042201]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [1043636, 197466]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [76695, 1047559]
processed_samples 14300 unjoint_samples 14300 joint_samples 42 [527394, 1046118]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [1043636, 197466]
processed_samples 14300 unjoint_samples 14300 joint_samples 42 [1046955, 813807]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [389192, 1047059]
processed_samples 14300 unjoint_samples 14300 joint_samples 42 [527394, 1046118]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [389192, 1047059]
processed_samples 14300 unjoint_samples 14300 joint_samples 42 [839538, 1047710]
processed_samples 14300 unjoint_samples 14300 joint_samples 42 [839538, 1047710]
processed_samples 14300 unjoint_samples 14300 joint_samples 42 [1046955, 813807]
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [767251, 1045284]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 14300 unjoint_samples 14300 joint_samples 43 [767251, 1045284]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
 [2024-11-30 20:31:53] iteration      343/     500 | consumed samples:         2744 | elapsed time per iteration (ms): 578439.8 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 1.261432E-06 | global batch size:     8 | lm loss: 8.647650E-01 | loss scale: 1.0 | grad norm: 0.580 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
 [2024-11-30 20:40:44] iteration      344/     500 | consumed samples:         2752 | elapsed time per iteration (ms): 530650.1 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 1.247961E-06 | global batch size:     8 | lm loss: 8.212827E-01 | loss scale: 1.0 | grad norm: 0.617 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
 [2024-11-30 20:49:18] iteration      345/     500 | consumed samples:         2760 | elapsed time per iteration (ms): 513944.1 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 1.234546E-06 | global batch size:     8 | lm loss: 8.558559E-01 | loss scale: 1.0 | grad norm: 0.776 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1047213, 59291]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [110012, 1030169]
processed_samples 14400 unjoint_samples 14400 joint_samples 44 [84987, 1046776]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1047213, 59291]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1043636, 451790]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [110012, 1030169]
processed_samples 14400 unjoint_samples 14400 joint_samples 44 [578833, 1042201]
processed_samples 14400 unjoint_samples 14400 joint_samples 44 [84987, 1046776]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [690881, 1047059]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [349469, 1047559]
processed_samples 14400 unjoint_samples 14400 joint_samples 42 [930189, 1046118]
processed_samples 14400 unjoint_samples 14400 joint_samples 44 [578833, 1042201]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1043636, 451790]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [690881, 1047059]
processed_samples 14400 unjoint_samples 14400 joint_samples 43 [349469, 1047559]
processed_samples 14400 unjoint_samples 14400 joint_samples 42 [930189, 1046118]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-11-30 21:00:13] iteration      346/     500 | consumed samples:         2768 | elapsed time per iteration (ms): 655458.5 | throughput per GPU (TFLOP/s/GPU): 79.4 | learning rate: 1.221185E-06 | global batch size:     8 | lm loss: 8.408645E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d517059440] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d517059440] mmco: unref short failure
[h264 @ 0x55d517059440] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1047213, 493755]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1047213, 493755]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [156327, 1048242]
processed_samples 14500 unjoint_samples 14500 joint_samples 44 [428707, 1046776]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [351813, 1030169]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [156327, 1048242]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1022921, 1047059]
processed_samples 14500 unjoint_samples 14500 joint_samples 44 [428707, 1046776]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [351813, 1030169]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1022921, 1047059]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1043636, 755528]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [649191, 1047559]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1043636, 755528]
processed_samples 14500 unjoint_samples 14500 joint_samples 44 [926515, 1042201]
processed_samples 14500 unjoint_samples 14500 joint_samples 44 [926515, 1042201]
processed_samples 14500 unjoint_samples 14500 joint_samples 43 [649191, 1047559]
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
 [2024-11-30 21:09:08] iteration      347/     500 | consumed samples:         2776 | elapsed time per iteration (ms): 534217.7 | throughput per GPU (TFLOP/s/GPU): 97.4 | learning rate: 1.207880E-06 | global batch size:     8 | lm loss: 8.221813E-01 | loss scale: 1.0 | grad norm: 0.547 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215de57ac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
 [2024-11-30 21:19:18] iteration      348/     500 | consumed samples:         2784 | elapsed time per iteration (ms): 610608.1 | throughput per GPU (TFLOP/s/GPU): 85.2 | learning rate: 1.194631E-06 | global batch size:     8 | lm loss: 7.808754E-01 | loss scale: 1.0 | grad norm: 0.618 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
processed_samples 14600 unjoint_samples 14600 joint_samples 44 [1046605, 187775]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [468598, 1048242]
processed_samples 14600 unjoint_samples 14600 joint_samples 45 [122465, 1046598]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1047213, 762387]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [957133, 1047559]
processed_samples 14600 unjoint_samples 14600 joint_samples 44 [716648, 1046776]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [668285, 1030169]
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1043636, 1024674]
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [468598, 1048242]
processed_samples 14600 unjoint_samples 14600 joint_samples 44 [1046605, 187775]
processed_samples 14600 unjoint_samples 14600 joint_samples 45 [122465, 1046598]
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1047213, 762387]
[h264 @ 0x55d513a64780] mmco: unref short failure
processed_samples 14600 unjoint_samples 14600 joint_samples 44 [716648, 1046776]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [668285, 1030169]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1043636, 1024674]
processed_samples 14600 unjoint_samples 14600 joint_samples 43 [957133, 1047559]
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
 [2024-11-30 21:28:44] iteration      349/     500 | consumed samples:         2792 | elapsed time per iteration (ms): 565549.1 | throughput per GPU (TFLOP/s/GPU): 92.0 | learning rate: 1.181440E-06 | global batch size:     8 | lm loss: 8.365794E-01 | loss scale: 1.0 | grad norm: 0.707 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
 [2024-11-30 21:39:35] iteration      350/     500 | consumed samples:         2800 | elapsed time per iteration (ms): 651391.3 | throughput per GPU (TFLOP/s/GPU): 79.9 | learning rate: 1.168305E-06 | global batch size:     8 | lm loss: 8.073776E-01 | loss scale: 1.0 | grad norm: 0.757 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
 [2024-11-30 21:48:46] iteration      351/     500 | consumed samples:         2808 | elapsed time per iteration (ms): 550324.4 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 1.155229E-06 | global batch size:     8 | lm loss: 8.848139E-01 | loss scale: 1.0 | grad norm: 1.070 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
 [2024-11-30 21:57:55] iteration      352/     500 | consumed samples:         2816 | elapsed time per iteration (ms): 549263.8 | throughput per GPU (TFLOP/s/GPU): 94.7 | learning rate: 1.142211E-06 | global batch size:     8 | lm loss: 8.049350E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [11089, 1046570]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [11089, 1046570]
processed_samples 14700 unjoint_samples 14700 joint_samples 45 [1039184, 79494]
processed_samples 14700 unjoint_samples 14700 joint_samples 45 [1039184, 79494]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [335570, 1047559]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1046605, 412179]
processed_samples 14700 unjoint_samples 14700 joint_samples 45 [477050, 1046598]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [335570, 1047559]
processed_samples 14700 unjoint_samples 14700 joint_samples 45 [477050, 1046598]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1046605, 412179]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1047404, 344796]
processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1047404, 344796]
processed_samples 14700 unjoint_samples 14700 joint_samples 43 [788256, 1048242]
processed_samples 14700 unjoint_samples 14700 joint_samples 43 [788256, 1048242]
processed_samples 14700 unjoint_samples 14700 joint_samples 43 [992955, 1030169]
processed_samples 14700 unjoint_samples 14700 joint_samples 43 [992955, 1030169]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
 [2024-11-30 22:06:26] iteration      353/     500 | consumed samples:         2824 | elapsed time per iteration (ms): 511600.3 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 1.129252E-06 | global batch size:     8 | lm loss: 8.550798E-01 | loss scale: 1.0 | grad norm: 0.775 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
 [2024-11-30 22:17:02] iteration      354/     500 | consumed samples:         2832 | elapsed time per iteration (ms): 635950.4 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 1.116353E-06 | global batch size:     8 | lm loss: 8.067714E-01 | loss scale: 1.0 | grad norm: 0.676 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d512d490c0] mmco: unref short failure
[h264 @ 0x55d512d490c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d512d490c0] mmco: unref short failure
[h264 @ 0x55d512d490c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215c0ec900] mmco: unref short failure
[h264 @ 0x56215c0ec900] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d516ea6140] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
processed_samples 14800 unjoint_samples 14800 joint_samples 45 [723060, 1046598]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [92635, 1048242]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [239637, 1047448]
processed_samples 14800 unjoint_samples 14800 joint_samples 45 [723060, 1046598]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [239637, 1047448]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [92635, 1048242]
processed_samples 14800 unjoint_samples 14800 joint_samples 45 [1039184, 429369]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [633469, 1047559]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [329384, 1046570]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [329384, 1046570]
processed_samples 14800 unjoint_samples 14800 joint_samples 45 [1039184, 429369]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1047404, 622276]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [633469, 1047559]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1046605, 710576]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1047404, 622276]
processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1046605, 710576]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-11-30 22:27:58] iteration      355/     500 | consumed samples:         2840 | elapsed time per iteration (ms): 656031.8 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 1.103514E-06 | global batch size:     8 | lm loss: 8.307241E-01 | loss scale: 1.0 | grad norm: 0.578 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [533642, 1047448]
processed_samples 14900 unjoint_samples 14900 joint_samples 46 [46039, 1046659]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [980286, 1047559]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [610685, 1046570]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [533642, 1047448]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [339028, 1048242]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1046605, 990142]
processed_samples 14900 unjoint_samples 14900 joint_samples 45 [1039184, 814639]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [980286, 1047559]
processed_samples 14900 unjoint_samples 14900 joint_samples 46 [46039, 1046659]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1047404, 983297]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1047404, 983297]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [339028, 1048242]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [610685, 1046570]
processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1046605, 990142]
processed_samples 14900 unjoint_samples 14900 joint_samples 45 [1039184, 814639]
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-11-30 22:36:51] iteration      356/     500 | consumed samples:         2848 | elapsed time per iteration (ms): 532141.4 | throughput per GPU (TFLOP/s/GPU): 97.8 | learning rate: 1.090736E-06 | global batch size:     8 | lm loss: 8.175163E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-11-30 22:46:26] iteration      357/     500 | consumed samples:         2856 | elapsed time per iteration (ms): 575283.9 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.078019E-06 | global batch size:     8 | lm loss: 8.372152E-01 | loss scale: 1.0 | grad norm: 0.692 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
 [2024-11-30 22:58:32] iteration      358/     500 | consumed samples:         2864 | elapsed time per iteration (ms): 725842.3 | throughput per GPU (TFLOP/s/GPU): 71.7 | learning rate: 1.065363E-06 | global batch size:     8 | lm loss: 8.198099E-01 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
 [2024-11-30 23:06:27] iteration      359/     500 | consumed samples:         2872 | elapsed time per iteration (ms): 475503.6 | throughput per GPU (TFLOP/s/GPU): 109.4 | learning rate: 1.052770E-06 | global batch size:     8 | lm loss: 8.032740E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
processed_samples 15000 unjoint_samples 15000 joint_samples 46 [67811, 1044908]
processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1047404, 204719]
processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1046605, 194912]
processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1026421, 275715]
[h264 @ 0x56215cbc4280] mmco: unref short failure
processed_samples 15000 unjoint_samples 15000 joint_samples 44 [696351, 1048242]
processed_samples 15000 unjoint_samples 15000 joint_samples 46 [299825, 1046659]
processed_samples 15000 unjoint_samples 15000 joint_samples 44 [939138, 1047448]
processed_samples 15000 unjoint_samples 15000 joint_samples 44 [936282, 1046570]
processed_samples 15000 unjoint_samples 15000 joint_samples 46 [299825, 1046659]
processed_samples 15000 unjoint_samples 15000 joint_samples 44 [696351, 1048242]
processed_samples 15000 unjoint_samples 15000 joint_samples 46 [67811, 1044908]
processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1046605, 194912]
processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1047404, 204719]
processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1026421, 275715]
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 15000 unjoint_samples 15000 joint_samples 44 [936282, 1046570]
processed_samples 15000 unjoint_samples 15000 joint_samples 44 [939138, 1047448]
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
 [2024-11-30 23:14:58] iteration      360/     500 | consumed samples:         2880 | elapsed time per iteration (ms): 511060.0 | throughput per GPU (TFLOP/s/GPU): 101.8 | learning rate: 1.040240E-06 | global batch size:     8 | lm loss: 8.227824E-01 | loss scale: 1.0 | grad norm: 0.751 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (257612.46, 257612.88)
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
 [2024-11-30 23:26:27] iteration      361/     500 | consumed samples:         2888 | elapsed time per iteration (ms): 430482.0 | throughput per GPU (TFLOP/s/GPU): 120.9 | learning rate: 1.027773E-06 | global batch size:     8 | lm loss: 8.282033E-01 | loss scale: 1.0 | grad norm: 0.768 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [161612, 1047448]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1042748, 177357]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1042748, 177357]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [161612, 1047448]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1046605, 525556]
processed_samples 15100 unjoint_samples 15100 joint_samples 46 [429458, 1044908]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1046605, 525556]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
processed_samples 15100 unjoint_samples 15100 joint_samples 46 [429458, 1044908]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1026421, 554426]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1026421, 554426]
processed_samples 15100 unjoint_samples 15100 joint_samples 46 [643541, 1046659]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1047404, 507909]
processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1047404, 507909]
processed_samples 15100 unjoint_samples 15100 joint_samples 46 [643541, 1046659]
processed_samples 15100 unjoint_samples 15100 joint_samples 44 [984374, 1048242]
processed_samples 15100 unjoint_samples 15100 joint_samples 44 [984374, 1048242]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
 [2024-11-30 23:35:06] iteration      362/     500 | consumed samples:         2896 | elapsed time per iteration (ms): 519023.1 | throughput per GPU (TFLOP/s/GPU): 100.3 | learning rate: 1.015370E-06 | global batch size:     8 | lm loss: 8.181028E-01 | loss scale: 1.0 | grad norm: 0.796 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [407349, 1047448]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1048022, 204247]
processed_samples 15200 unjoint_samples 15200 joint_samples 46 [646524, 1044908]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1026421, 901311]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1042748, 568415]
processed_samples 15200 unjoint_samples 15200 joint_samples 46 [911276, 1046659]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1046605, 868111]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1047404, 759171]
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [407349, 1047448]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1048022, 204247]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1042748, 568415]
processed_samples 15200 unjoint_samples 15200 joint_samples 46 [911276, 1046659]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1026421, 901311]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1046605, 868111]
processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1047404, 759171]
processed_samples 15200 unjoint_samples 15200 joint_samples 46 [646524, 1044908]
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517d67ec0] mmco: unref short failure
[h264 @ 0x55d517d67ec0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-11-30 23:47:32] iteration      363/     500 | consumed samples:         2904 | elapsed time per iteration (ms): 745984.1 | throughput per GPU (TFLOP/s/GPU): 69.8 | learning rate: 1.003032E-06 | global batch size:     8 | lm loss: 7.587072E-01 | loss scale: 1.0 | grad norm: 0.632 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
 [2024-11-30 23:58:28] iteration      364/     500 | consumed samples:         2912 | elapsed time per iteration (ms): 656084.8 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 9.907581E-07 | global batch size:     8 | lm loss: 8.105970E-01 | loss scale: 1.0 | grad norm: 0.607 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 00:08:00] iteration      365/     500 | consumed samples:         2920 | elapsed time per iteration (ms): 571895.2 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 9.785499E-07 | global batch size:     8 | lm loss: 8.919947E-01 | loss scale: 1.0 | grad norm: 0.746 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
processed_samples 15300 unjoint_samples 15300 joint_samples 46 [1046605, 90877]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [642808, 1047448]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1048022, 581832]
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 15300 unjoint_samples 15300 joint_samples 47 [103812, 1046659]
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 15300 unjoint_samples 15300 joint_samples 46 [96183, 1043822]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1047404, 1034632]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1042748, 830319]
[h264 @ 0x55d5130aa740] mmco: unref short failure
processed_samples 15300 unjoint_samples 15300 joint_samples 46 [1046605, 90877]
processed_samples 15300 unjoint_samples 15300 joint_samples 46 [940276, 1044908]
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [642808, 1047448]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1048022, 581832]
processed_samples 15300 unjoint_samples 15300 joint_samples 46 [96183, 1043822]
processed_samples 15300 unjoint_samples 15300 joint_samples 47 [103812, 1046659]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1042748, 830319]
processed_samples 15300 unjoint_samples 15300 joint_samples 46 [940276, 1044908]
processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1047404, 1034632]
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
 [2024-12-01 00:16:26] iteration      366/     500 | consumed samples:         2928 | elapsed time per iteration (ms): 506663.6 | throughput per GPU (TFLOP/s/GPU): 102.7 | learning rate: 9.664075E-07 | global batch size:     8 | lm loss: 8.039944E-01 | loss scale: 1.0 | grad norm: 0.731 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
 [2024-12-01 00:25:33] iteration      367/     500 | consumed samples:         2936 | elapsed time per iteration (ms): 546381.3 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 9.543316E-07 | global batch size:     8 | lm loss: 7.765344E-01 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
 [2024-12-01 00:33:11] iteration      368/     500 | consumed samples:         2944 | elapsed time per iteration (ms): 458843.2 | throughput per GPU (TFLOP/s/GPU): 113.4 | learning rate: 9.423227E-07 | global batch size:     8 | lm loss: 7.737644E-01 | loss scale: 1.0 | grad norm: 4.278 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [379375, 1043822]
processed_samples 15400 unjoint_samples 15400 joint_samples 45 [1048022, 1045809]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1047411, 28274]
processed_samples 15400 unjoint_samples 15400 joint_samples 47 [1047529, 151070]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [263010, 1047821]
processed_samples 15400 unjoint_samples 15400 joint_samples 45 [994220, 1047448]
processed_samples 15400 unjoint_samples 15400 joint_samples 47 [467656, 1046659]
processed_samples 15400 unjoint_samples 15400 joint_samples 45 [1048022, 1045809]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [379375, 1043822]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1047411, 28274]
processed_samples 15400 unjoint_samples 15400 joint_samples 47 [1047529, 151070]
processed_samples 15400 unjoint_samples 15400 joint_samples 47 [467656, 1046659]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [263010, 1047821]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1046605, 386686]
processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1046605, 386686]
processed_samples 15400 unjoint_samples 15400 joint_samples 45 [994220, 1047448]
 [2024-12-01 00:41:17] iteration      369/     500 | consumed samples:         2952 | elapsed time per iteration (ms): 485235.8 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 9.303812E-07 | global batch size:     8 | lm loss: 7.778735E-01 | loss scale: 1.0 | grad norm: 0.625 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
 [2024-12-01 00:50:04] iteration      370/     500 | consumed samples:         2960 | elapsed time per iteration (ms): 526954.5 | throughput per GPU (TFLOP/s/GPU): 98.8 | learning rate: 9.185077E-07 | global batch size:     8 | lm loss: 8.354955E-01 | loss scale: 1.0 | grad norm: 0.739 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215eca3f80] mmco: unref short failure
[h264 @ 0x56215eca3f80] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1047411, 345187]
processed_samples 15500 unjoint_samples 15500 joint_samples 47 [1047529, 476934]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [404461, 1045809]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046514, 227331]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [564845, 1047821]
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046605, 706854]
processed_samples 15500 unjoint_samples 15500 joint_samples 47 [784848, 1046659]
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [653970, 1043822]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [404461, 1045809]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [564845, 1047821]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1047411, 345187]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046514, 227331]
processed_samples 15500 unjoint_samples 15500 joint_samples 47 [1047529, 476934]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046605, 706854]
processed_samples 15500 unjoint_samples 15500 joint_samples 46 [653970, 1043822]
processed_samples 15500 unjoint_samples 15500 joint_samples 47 [784848, 1046659]
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x55d51459dd80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51459dd80] mmco: unref short failure
[h264 @ 0x55d51459dd80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51459dd80] mmco: unref short failure
[h264 @ 0x55d51459dd80] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-12-01 01:02:18] iteration      371/     500 | consumed samples:         2968 | elapsed time per iteration (ms): 734588.2 | throughput per GPU (TFLOP/s/GPU): 70.8 | learning rate: 9.067026E-07 | global batch size:     8 | lm loss: 8.195457E-01 | loss scale: 1.0 | grad norm: 0.665 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 01:15:02] iteration      372/     500 | consumed samples:         2976 | elapsed time per iteration (ms): 763327.3 | throughput per GPU (TFLOP/s/GPU): 68.2 | learning rate: 8.949665E-07 | global batch size:     8 | lm loss: 7.728306E-01 | loss scale: 1.0 | grad norm: 0.628 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046514, 489869]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [728191, 1045809]
processed_samples 15600 unjoint_samples 15600 joint_samples 47 [15386, 1048015]
processed_samples 15600 unjoint_samples 15600 joint_samples 48 [18458, 1046659]
processed_samples 15600 unjoint_samples 15600 joint_samples 47 [1047529, 849609]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046514, 489869]
processed_samples 15600 unjoint_samples 15600 joint_samples 47 [15386, 1048015]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [728191, 1045809]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1047411, 713963]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046605, 1016691]
processed_samples 15600 unjoint_samples 15600 joint_samples 48 [18458, 1046659]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [818786, 1047821]
[h264 @ 0x55d513a4c280] mmco: unref short failure
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1047411, 713963]
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [818786, 1047821]
[h264 @ 0x56215d4974c0] mmco: unref short failure
processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046605, 1016691]
processed_samples 15600 unjoint_samples 15600 joint_samples 47 [1047529, 849609]
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
 [2024-12-01 01:25:36] iteration      373/     500 | consumed samples:         2984 | elapsed time per iteration (ms): 634333.7 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 8.832998E-07 | global batch size:     8 | lm loss: 8.350987E-01 | loss scale: 1.0 | grad norm: 0.681 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d513a4c280] mmco: unref short failure
[h264 @ 0x55d513a4c280] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-12-01 01:37:07] iteration      374/     500 | consumed samples:         2992 | elapsed time per iteration (ms): 691168.5 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 8.717031E-07 | global batch size:     8 | lm loss: 7.817224E-01 | loss scale: 1.0 | grad norm: 0.547 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
 [2024-12-01 01:44:03] iteration      375/     500 | consumed samples:         3000 | elapsed time per iteration (ms): 415366.5 | throughput per GPU (TFLOP/s/GPU): 125.3 | learning rate: 8.601767E-07 | global batch size:     8 | lm loss: 7.525834E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1046514, 751262]
processed_samples 15700 unjoint_samples 15700 joint_samples 48 [1047529, 257143]
processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1041522, 131458]
processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1047411, 1004159]
processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1046605, 275662]
processed_samples 15700 unjoint_samples 15700 joint_samples 47 [284312, 1048015]
processed_samples 15700 unjoint_samples 15700 joint_samples 48 [247417, 1046659]
processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1013183, 1045809]
processed_samples 15700 unjoint_samples 15700 joint_samples 48 [1047529, 257143]
processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1041522, 131458]
processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1046514, 751262]
processed_samples 15700 unjoint_samples 15700 joint_samples 47 [284312, 1048015]
processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1047411, 1004159]
processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1046605, 275662]
processed_samples 15700 unjoint_samples 15700 joint_samples 48 [247417, 1046659]
processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1013183, 1045809]
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-12-01 01:53:10] iteration      376/     500 | consumed samples:         3008 | elapsed time per iteration (ms): 547870.2 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 8.487213E-07 | global batch size:     8 | lm loss: 8.182274E-01 | loss scale: 1.0 | grad norm: 0.607 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
 [2024-12-01 02:02:13] iteration      377/     500 | consumed samples:         3016 | elapsed time per iteration (ms): 542559.7 | throughput per GPU (TFLOP/s/GPU): 95.9 | learning rate: 8.373373E-07 | global batch size:     8 | lm loss: 8.583971E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d513a4c280] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1042287, 326785]
processed_samples 15800 unjoint_samples 15800 joint_samples 48 [1047529, 653291]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [648506, 1048015]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [254576, 1046923]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1041522, 525114]
processed_samples 15800 unjoint_samples 15800 joint_samples 46 [1046514, 1026555]
processed_samples 15800 unjoint_samples 15800 joint_samples 48 [619260, 1046659]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1042287, 326785]
processed_samples 15800 unjoint_samples 15800 joint_samples 48 [1047529, 653291]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [648506, 1048015]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1046605, 680037]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [254576, 1046923]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1041522, 525114]
processed_samples 15800 unjoint_samples 15800 joint_samples 48 [619260, 1046659]
processed_samples 15800 unjoint_samples 15800 joint_samples 46 [1046514, 1026555]
processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1046605, 680037]
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
 [2024-12-01 02:11:31] iteration      378/     500 | consumed samples:         3024 | elapsed time per iteration (ms): 557982.8 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 8.260251E-07 | global batch size:     8 | lm loss: 8.015327E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a4c280] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516effb00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215bc88940] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
 [2024-12-01 02:25:34] iteration      379/     500 | consumed samples:         3032 | elapsed time per iteration (ms): 843400.3 | throughput per GPU (TFLOP/s/GPU): 61.7 | learning rate: 8.147852E-07 | global batch size:     8 | lm loss: 7.978020E-01 | loss scale: 1.0 | grad norm: 1.030 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1046605, 83978]
processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1046605, 83978]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1041522, 860416]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [244074, 1048371]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [244074, 1048371]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [545936, 1046923]
processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1047529, 936904]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [996543, 1048015]
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [996543, 1048015]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1042287, 724518]
processed_samples 15900 unjoint_samples 15900 joint_samples 48 [897266, 1046659]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1042287, 724518]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [545936, 1046923]
[h264 @ 0x56215b35e3c0] mmco: unref short failure
processed_samples 15900 unjoint_samples 15900 joint_samples 48 [897266, 1046659]
processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1041522, 860416]
processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1047529, 936904]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
 [2024-12-01 02:33:19] iteration      380/     500 | consumed samples:         3040 | elapsed time per iteration (ms): 464339.8 | throughput per GPU (TFLOP/s/GPU): 112.1 | learning rate: 8.036182E-07 | global batch size:     8 | lm loss: 8.628017E-01 | loss scale: 1.0 | grad norm: 0.700 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (150869.38, 150869.98)
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
 [2024-12-01 02:45:31] iteration      381/     500 | consumed samples:         3048 | elapsed time per iteration (ms): 581575.6 | throughput per GPU (TFLOP/s/GPU): 89.5 | learning rate: 7.925244E-07 | global batch size:     8 | lm loss: 8.163093E-01 | loss scale: 1.0 | grad norm: 1.139 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
 [2024-12-01 02:55:19] iteration      382/     500 | consumed samples:         3056 | elapsed time per iteration (ms): 587749.5 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 7.815044E-07 | global batch size:     8 | lm loss: 7.799031E-01 | loss scale: 1.0 | grad norm: 0.672 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d51863a080] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
 [2024-12-01 03:05:34] iteration      383/     500 | consumed samples:         3064 | elapsed time per iteration (ms): 614570.1 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 7.705586E-07 | global batch size:     8 | lm loss: 7.895235E-01 | loss scale: 1.0 | grad norm: 0.646 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
processed_samples 16000 unjoint_samples 16000 joint_samples 49 [222990, 1046734]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [52873, 1046859]
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [1046605, 351559]
processed_samples 16000 unjoint_samples 16000 joint_samples 47 [491351, 1048371]
processed_samples 16000 unjoint_samples 16000 joint_samples 49 [222990, 1046734]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [52873, 1046859]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [1046605, 351559]
processed_samples 16000 unjoint_samples 16000 joint_samples 47 [491351, 1048371]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [8070, 1046923]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [8070, 1046923]
 [2024-12-01 03:13:47] iteration      384/     500 | consumed samples:         3072 | elapsed time per iteration (ms): 493563.5 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 7.596874E-07 | global batch size:     8 | lm loss: 8.024371E-01 | loss scale: 1.0 | grad norm: 0.695 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 16000 unjoint_samples 16000 joint_samples 49 [201258, 1046659]
processed_samples 16000 unjoint_samples 16000 joint_samples 49 [201258, 1046659]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [207561, 1048015]
processed_samples 16000 unjoint_samples 16000 joint_samples 48 [207561, 1048015]
processed_samples 16000 unjoint_samples 16000 joint_samples 47 [1042287, 1031405]
processed_samples 16000 unjoint_samples 16000 joint_samples 47 [1042287, 1031405]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d513a1f200] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-12-01 03:22:55] iteration      385/     500 | consumed samples:         3080 | elapsed time per iteration (ms): 547368.2 | throughput per GPU (TFLOP/s/GPU): 95.1 | learning rate: 7.488913E-07 | global batch size:     8 | lm loss: 8.267174E-01 | loss scale: 1.0 | grad norm: 0.742 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x55d5137029c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [395433, 1040248]
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [369698, 1046859]
[h264 @ 0x56215be9ff00] mmco: unref short failure
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [504219, 1048015]
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [343406, 1046923]
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [1046605, 639047]
processed_samples 16100 unjoint_samples 16100 joint_samples 49 [540322, 1046734]
processed_samples 16100 unjoint_samples 16100 joint_samples 47 [809565, 1048371]
processed_samples 16100 unjoint_samples 16100 joint_samples 49 [486512, 1046659]
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [395433, 1040248]
processed_samples 16100 unjoint_samples 16100 joint_samples 49 [540322, 1046734]
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [369698, 1046859]
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [343406, 1046923]
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [504219, 1048015]
processed_samples 16100 unjoint_samples 16100 joint_samples 49 [486512, 1046659]
processed_samples 16100 unjoint_samples 16100 joint_samples 48 [1046605, 639047]
processed_samples 16100 unjoint_samples 16100 joint_samples 47 [809565, 1048371]
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
 [2024-12-01 03:35:28] iteration      386/     500 | consumed samples:         3088 | elapsed time per iteration (ms): 753179.4 | throughput per GPU (TFLOP/s/GPU): 69.1 | learning rate: 7.381709E-07 | global batch size:     8 | lm loss: 8.138526E-01 | loss scale: 1.0 | grad norm: 0.833 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046838, 36849]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [696758, 1046923]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046838, 36849]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [715740, 1046859]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046605, 893554]
processed_samples 16200 unjoint_samples 16200 joint_samples 49 [753503, 1046734]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [696758, 1046923]
processed_samples 16200 unjoint_samples 16200 joint_samples 49 [765154, 1046659]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [715740, 1046859]
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 16200 unjoint_samples 16200 joint_samples 49 [753503, 1046734]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [864313, 1048015]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [764185, 1040248]
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046605, 893554]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
processed_samples 16200 unjoint_samples 16200 joint_samples 49 [765154, 1046659]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [864313, 1048015]
processed_samples 16200 unjoint_samples 16200 joint_samples 48 [764185, 1040248]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
 [2024-12-01 03:48:38] iteration      387/     500 | consumed samples:         3096 | elapsed time per iteration (ms): 790397.8 | throughput per GPU (TFLOP/s/GPU): 65.8 | learning rate: 7.275264E-07 | global batch size:     8 | lm loss: 7.640258E-01 | loss scale: 1.0 | grad norm: 0.539 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
 [2024-12-01 03:59:48] iteration      388/     500 | consumed samples:         3104 | elapsed time per iteration (ms): 670250.3 | throughput per GPU (TFLOP/s/GPU): 77.6 | learning rate: 7.169584E-07 | global batch size:     8 | lm loss: 7.533755E-01 | loss scale: 1.0 | grad norm: 0.731 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
 [2024-12-01 04:10:39] iteration      389/     500 | consumed samples:         3112 | elapsed time per iteration (ms): 650075.4 | throughput per GPU (TFLOP/s/GPU): 80.1 | learning rate: 7.064673E-07 | global batch size:     8 | lm loss: 8.156929E-01 | loss scale: 1.0 | grad norm: 0.596 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
 [2024-12-01 04:19:38] iteration      390/     500 | consumed samples:         3120 | elapsed time per iteration (ms): 538977.8 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 6.960536E-07 | global batch size:     8 | lm loss: 7.825867E-01 | loss scale: 1.0 | grad norm: 0.666 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 16300 unjoint_samples 16300 joint_samples 50 [1038849, 107087]
processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1046605, 224095]
processed_samples 16300 unjoint_samples 16300 joint_samples 50 [19272, 1047822]
processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1009126, 149600]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1046838, 447430]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1021208, 1046859]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [960290, 1046923]
processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1046605, 224095]
processed_samples 16300 unjoint_samples 16300 joint_samples 50 [1038849, 107087]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [960290, 1046923]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1029841, 1040248]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1046838, 447430]
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1021208, 1046859]
processed_samples 16300 unjoint_samples 16300 joint_samples 50 [19272, 1047822]
processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1009126, 149600]
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1029841, 1040248]
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
 [2024-12-01 04:28:39] iteration      391/     500 | consumed samples:         3128 | elapsed time per iteration (ms): 541262.2 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 6.857177E-07 | global batch size:     8 | lm loss: 8.480158E-01 | loss scale: 1.0 | grad norm: 0.797 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 04:37:30] iteration      392/     500 | consumed samples:         3136 | elapsed time per iteration (ms): 531034.6 | throughput per GPU (TFLOP/s/GPU): 98.0 | learning rate: 6.754599E-07 | global batch size:     8 | lm loss: 8.135203E-01 | loss scale: 1.0 | grad norm: 0.631 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 04:46:42] iteration      393/     500 | consumed samples:         3144 | elapsed time per iteration (ms): 552274.7 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 6.652809E-07 | global batch size:     8 | lm loss: 8.489851E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [313617, 1046859]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046605, 499525]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046237, 241240]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1009126, 434271]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046562, 221364]
processed_samples 16400 unjoint_samples 16400 joint_samples 50 [1038849, 521646]
[h264 @ 0x56215b6ebf40] mmco: unref short failure
processed_samples 16400 unjoint_samples 16400 joint_samples 50 [343985, 1047822]
processed_samples 16400 unjoint_samples 16400 joint_samples 48 [1046838, 743582]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046605, 499525]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [313617, 1046859]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046237, 241240]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046562, 221364]
processed_samples 16400 unjoint_samples 16400 joint_samples 50 [1038849, 521646]
[h264 @ 0x55d513904400] mmco: unref short failure
processed_samples 16400 unjoint_samples 16400 joint_samples 50 [343985, 1047822]
processed_samples 16400 unjoint_samples 16400 joint_samples 48 [1046838, 743582]
processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1009126, 434271]
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215d200880] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51719ca00] mmco: unref short failure
[h264 @ 0x55d51719ca00] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51719ca00] mmco: unref short failure
[h264 @ 0x55d51719ca00] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
 [2024-12-01 04:58:04] iteration      394/     500 | consumed samples:         3152 | elapsed time per iteration (ms): 681448.5 | throughput per GPU (TFLOP/s/GPU): 76.4 | learning rate: 6.551809E-07 | global batch size:     8 | lm loss: 8.453261E-01 | loss scale: 1.0 | grad norm: 0.673 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1047034, 28245]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046605, 892360]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1047034, 28245]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046237, 457346]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046237, 457346]
processed_samples 16500 unjoint_samples 16500 joint_samples 50 [748263, 1047822]
processed_samples 16500 unjoint_samples 16500 joint_samples 50 [1038849, 827441]
processed_samples 16500 unjoint_samples 16500 joint_samples 50 [748263, 1047822]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046562, 465836]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046562, 465836]
processed_samples 16500 unjoint_samples 16500 joint_samples 50 [1038849, 827441]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046605, 892360]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1009126, 707858]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1009126, 707858]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [639624, 1046859]
processed_samples 16500 unjoint_samples 16500 joint_samples 49 [639624, 1046859]
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x5621619e5680] mmco: unref short failure
[h264 @ 0x5621619e5680] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-12-01 05:10:22] iteration      395/     500 | consumed samples:         3160 | elapsed time per iteration (ms): 738851.2 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 6.451604E-07 | global batch size:     8 | lm loss: 8.372583E-01 | loss scale: 1.0 | grad norm: 0.717 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
 [2024-12-01 05:21:54] iteration      396/     500 | consumed samples:         3168 | elapsed time per iteration (ms): 691548.7 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 6.352198E-07 | global batch size:     8 | lm loss: 8.565010E-01 | loss scale: 1.0 | grad norm: 0.588 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
 [2024-12-01 05:32:33] iteration      397/     500 | consumed samples:         3176 | elapsed time per iteration (ms): 639376.8 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 6.253596E-07 | global batch size:     8 | lm loss: 7.959479E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
processed_samples 16600 unjoint_samples 16600 joint_samples 51 [1043499, 26749]
processed_samples 16600 unjoint_samples 16600 joint_samples 51 [970568, 335378]
processed_samples 16600 unjoint_samples 16600 joint_samples 50 [146901, 1047612]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1047034, 271818]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [871191, 1046859]
processed_samples 16600 unjoint_samples 16600 joint_samples 50 [146901, 1047612]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1047034, 271818]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1009126, 997896]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046562, 709072]
processed_samples 16600 unjoint_samples 16600 joint_samples 51 [970568, 335378]
processed_samples 16600 unjoint_samples 16600 joint_samples 51 [1043499, 26749]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046562, 709072]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046237, 776619]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046237, 776619]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1009126, 997896]
processed_samples 16600 unjoint_samples 16600 joint_samples 49 [871191, 1046859]
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
 [2024-12-01 05:41:49] iteration      398/     500 | consumed samples:         3184 | elapsed time per iteration (ms): 555342.2 | throughput per GPU (TFLOP/s/GPU): 93.7 | learning rate: 6.155801E-07 | global batch size:     8 | lm loss: 8.679034E-01 | loss scale: 1.0 | grad norm: 1.141 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
 [2024-12-01 05:53:35] iteration      399/     500 | consumed samples:         3192 | elapsed time per iteration (ms): 706572.2 | throughput per GPU (TFLOP/s/GPU): 73.7 | learning rate: 6.058818E-07 | global batch size:     8 | lm loss: 8.082343E-01 | loss scale: 1.0 | grad norm: 1.042 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
processed_samples 16700 unjoint_samples 16700 joint_samples 51 [1043499, 491958]
processed_samples 16700 unjoint_samples 16700 joint_samples 50 [926586, 345655]
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 16700 unjoint_samples 16700 joint_samples 50 [369808, 1046503]
processed_samples 16700 unjoint_samples 16700 joint_samples 50 [512500, 1047612]
processed_samples 16700 unjoint_samples 16700 joint_samples 51 [970568, 643165]
processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046562, 995829]
processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1047034, 566436]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046237, 1038259]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 16700 unjoint_samples 16700 joint_samples 50 [926586, 345655]
processed_samples 16700 unjoint_samples 16700 joint_samples 50 [512500, 1047612]
processed_samples 16700 unjoint_samples 16700 joint_samples 51 [1043499, 491958]
processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1047034, 566436]
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 16700 unjoint_samples 16700 joint_samples 50 [369808, 1046503]
processed_samples 16700 unjoint_samples 16700 joint_samples 51 [970568, 643165]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046562, 995829]
processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046237, 1038259]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
 [2024-12-01 06:02:17] iteration      400/     500 | consumed samples:         3200 | elapsed time per iteration (ms): 521421.3 | throughput per GPU (TFLOP/s/GPU): 99.8 | learning rate: 5.962651E-07 | global batch size:     8 | lm loss: 8.783824E-01 | loss scale: 1.0 | grad norm: 0.670 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (156572.19, 156572.53)
 [2024-12-01 06:13:08] iteration      401/     500 | consumed samples:         3208 | elapsed time per iteration (ms): 494537.3 | throughput per GPU (TFLOP/s/GPU): 105.2 | learning rate: 5.867304E-07 | global batch size:     8 | lm loss: 7.950339E-01 | loss scale: 1.0 | grad norm: 0.636 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [926586, 644707]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [808536, 1047612]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046237, 284440]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046562, 244148]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [705796, 1046503]
processed_samples 16800 unjoint_samples 16800 joint_samples 51 [970568, 896207]
processed_samples 16800 unjoint_samples 16800 joint_samples 49 [1047034, 859904]
processed_samples 16800 unjoint_samples 16800 joint_samples 51 [1043499, 732131]
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [926586, 644707]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [808536, 1047612]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [705796, 1046503]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046562, 244148]
processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046237, 284440]
processed_samples 16800 unjoint_samples 16800 joint_samples 51 [970568, 896207]
processed_samples 16800 unjoint_samples 16800 joint_samples 49 [1047034, 859904]
[h264 @ 0x55d517193e00] mmco: unref short failure
processed_samples 16800 unjoint_samples 16800 joint_samples 51 [1043499, 732131]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-12-01 06:25:09] iteration      402/     500 | consumed samples:         3216 | elapsed time per iteration (ms): 720630.2 | throughput per GPU (TFLOP/s/GPU): 72.2 | learning rate: 5.772780E-07 | global batch size:     8 | lm loss: 8.785825E-01 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
 [2024-12-01 06:38:12] iteration      403/     500 | consumed samples:         3224 | elapsed time per iteration (ms): 783034.3 | throughput per GPU (TFLOP/s/GPU): 66.5 | learning rate: 5.679084E-07 | global batch size:     8 | lm loss: 7.533233E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 06:47:27] iteration      404/     500 | consumed samples:         3232 | elapsed time per iteration (ms): 555287.0 | throughput per GPU (TFLOP/s/GPU): 93.7 | learning rate: 5.586219E-07 | global batch size:     8 | lm loss: 7.910761E-01 | loss scale: 1.0 | grad norm: 0.566 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 06:58:44] iteration      405/     500 | consumed samples:         3240 | elapsed time per iteration (ms): 677427.3 | throughput per GPU (TFLOP/s/GPU): 76.8 | learning rate: 5.494190E-07 | global batch size:     8 | lm loss: 8.007294E-01 | loss scale: 1.0 | grad norm: 0.787 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
processed_samples 16900 unjoint_samples 16900 joint_samples 52 [95173, 1043931]
processed_samples 16900 unjoint_samples 16900 joint_samples 51 [1046364, 46299]
processed_samples 16900 unjoint_samples 16900 joint_samples 52 [1046496, 89101]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046237, 563939]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046562, 593415]
processed_samples 16900 unjoint_samples 16900 joint_samples 51 [1046364, 46299]
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [153945, 1030463]
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [987809, 994613]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [153945, 1030463]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [983868, 1046503]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046237, 563939]
processed_samples 16900 unjoint_samples 16900 joint_samples 52 [95173, 1043931]
processed_samples 16900 unjoint_samples 16900 joint_samples 52 [1046496, 89101]
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046562, 593415]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [983868, 1046503]
processed_samples 16900 unjoint_samples 16900 joint_samples 50 [987809, 994613]
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
 [2024-12-01 07:10:25] iteration      406/     500 | consumed samples:         3248 | elapsed time per iteration (ms): 701132.9 | throughput per GPU (TFLOP/s/GPU): 74.2 | learning rate: 5.403001E-07 | global batch size:     8 | lm loss: 8.500883E-01 | loss scale: 1.0 | grad norm: 0.643 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-12-01 07:19:37] iteration      407/     500 | consumed samples:         3256 | elapsed time per iteration (ms): 551624.2 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 5.312654E-07 | global batch size:     8 | lm loss: 8.234965E-01 | loss scale: 1.0 | grad norm: 0.589 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d5133c8980] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
processed_samples 17000 unjoint_samples 17000 joint_samples 52 [1046496, 356297]
processed_samples 17000 unjoint_samples 17000 joint_samples 50 [451472, 1030463]
processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046539, 136773]
processed_samples 17000 unjoint_samples 17000 joint_samples 51 [203326, 1046503]
processed_samples 17000 unjoint_samples 17000 joint_samples 52 [343107, 1043931]
processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046364, 514149]
processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046562, 948074]
[h264 @ 0x56215cc254c0] mmco: unref short failure
processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046237, 898074]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046539, 136773]
processed_samples 17000 unjoint_samples 17000 joint_samples 52 [1046496, 356297]
processed_samples 17000 unjoint_samples 17000 joint_samples 51 [203326, 1046503]
processed_samples 17000 unjoint_samples 17000 joint_samples 52 [343107, 1043931]
processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046364, 514149]
processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046237, 898074]
processed_samples 17000 unjoint_samples 17000 joint_samples 50 [451472, 1030463]
processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046562, 948074]
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215d072240] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-12-01 07:29:57] iteration      408/     500 | consumed samples:         3264 | elapsed time per iteration (ms): 619408.4 | throughput per GPU (TFLOP/s/GPU): 84.0 | learning rate: 5.223155E-07 | global batch size:     8 | lm loss: 8.258877E-01 | loss scale: 1.0 | grad norm: 0.616 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
 [2024-12-01 07:39:06] iteration      409/     500 | consumed samples:         3272 | elapsed time per iteration (ms): 549850.1 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 5.134507E-07 | global batch size:     8 | lm loss: 8.401117E-01 | loss scale: 1.0 | grad norm: 0.585 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046539, 416250]
processed_samples 17100 unjoint_samples 17100 joint_samples 50 [691117, 1030463]
processed_samples 17100 unjoint_samples 17100 joint_samples 52 [650495, 1043931]
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [182361, 1046533]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [124216, 1043853]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [667616, 1046503]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046364, 772920]
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 17100 unjoint_samples 17100 joint_samples 52 [1046496, 739044]
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046539, 416250]
processed_samples 17100 unjoint_samples 17100 joint_samples 52 [650495, 1043931]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [124216, 1043853]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [182361, 1046533]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [667616, 1046503]
processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046364, 772920]
processed_samples 17100 unjoint_samples 17100 joint_samples 52 [1046496, 739044]
processed_samples 17100 unjoint_samples 17100 joint_samples 50 [691117, 1030463]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215f714500] mmco: unref short failure
[h264 @ 0x56215f714500] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
 [2024-12-01 07:49:41] iteration      410/     500 | consumed samples:         3280 | elapsed time per iteration (ms): 634908.6 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 5.046713E-07 | global batch size:     8 | lm loss: 8.124939E-01 | loss scale: 1.0 | grad norm: 0.629 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 08:02:32] iteration      411/     500 | consumed samples:         3288 | elapsed time per iteration (ms): 770692.2 | throughput per GPU (TFLOP/s/GPU): 67.5 | learning rate: 4.959777E-07 | global batch size:     8 | lm loss: 8.272680E-01 | loss scale: 1.0 | grad norm: 0.704 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
processed_samples 17200 unjoint_samples 17200 joint_samples 53 [1046496, 42043]
processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1046364, 81991]
processed_samples 17200 unjoint_samples 17200 joint_samples 50 [1022791, 1030463]
processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1022425, 1043931]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [512052, 1043853]
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [498091, 1046533]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [945808, 1046503]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [1046539, 847044]
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1046364, 81991]
processed_samples 17200 unjoint_samples 17200 joint_samples 53 [1046496, 42043]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [512052, 1043853]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [498091, 1046533]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [1046539, 847044]
processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1022425, 1043931]
processed_samples 17200 unjoint_samples 17200 joint_samples 51 [945808, 1046503]
processed_samples 17200 unjoint_samples 17200 joint_samples 50 [1022791, 1030463]
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-12-01 08:13:28] iteration      412/     500 | consumed samples:         3296 | elapsed time per iteration (ms): 656266.9 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 4.873703E-07 | global batch size:     8 | lm loss: 8.331949E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
 [2024-12-01 08:21:45] iteration      413/     500 | consumed samples:         3304 | elapsed time per iteration (ms): 497136.0 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 4.788494E-07 | global batch size:     8 | lm loss: 7.713867E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
 [2024-12-01 08:32:02] iteration      414/     500 | consumed samples:         3312 | elapsed time per iteration (ms): 616735.0 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 4.704155E-07 | global batch size:     8 | lm loss: 7.833863E-01 | loss scale: 1.0 | grad norm: 0.689 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-12-01 08:41:12] iteration      415/     500 | consumed samples:         3320 | elapsed time per iteration (ms): 550190.2 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 4.620688E-07 | global batch size:     8 | lm loss: 7.809198E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x55d51aa06d80] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 17300 unjoint_samples 17300 joint_samples 52 [254398, 1024488]
processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1046496, 325096]
processed_samples 17300 unjoint_samples 17300 joint_samples 51 [1046874, 224911]
processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046254, 251953]
processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046364, 424804]
processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1041886, 329927]
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
processed_samples 17300 unjoint_samples 17300 joint_samples 51 [957816, 1046533]
processed_samples 17300 unjoint_samples 17300 joint_samples 51 [944115, 1043853]
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
processed_samples 17300 unjoint_samples 17300 joint_samples 52 [254398, 1024488]
processed_samples 17300 unjoint_samples 17300 joint_samples 51 [1046874, 224911]
processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1046496, 325096]
processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046254, 251953]
processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1041886, 329927]
processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046364, 424804]
processed_samples 17300 unjoint_samples 17300 joint_samples 51 [957816, 1046533]
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 17300 unjoint_samples 17300 joint_samples 51 [944115, 1043853]
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-12-01 08:49:28] iteration      416/     500 | consumed samples:         3328 | elapsed time per iteration (ms): 495207.7 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 4.538097E-07 | global batch size:     8 | lm loss: 8.112941E-01 | loss scale: 1.0 | grad norm: 0.704 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
 [2024-12-01 08:57:17] iteration      417/     500 | consumed samples:         3336 | elapsed time per iteration (ms): 469064.3 | throughput per GPU (TFLOP/s/GPU): 110.9 | learning rate: 4.456385E-07 | global batch size:     8 | lm loss: 7.586689E-01 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x56215b271600] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
processed_samples 17400 unjoint_samples 17400 joint_samples 51 [1046874, 616398]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046364, 851077]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1047235, 363744]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1045065, 210144]
processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1041886, 758627]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046254, 758247]
processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1046496, 614136]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [540046, 1024488]
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f714500] mmco: unref short failure
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1047235, 363744]
processed_samples 17400 unjoint_samples 17400 joint_samples 51 [1046874, 616398]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [540046, 1024488]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1045065, 210144]
processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1041886, 758627]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046364, 851077]
processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1046496, 614136]
processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046254, 758247]
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
 [2024-12-01 09:07:39] iteration      418/     500 | consumed samples:         3344 | elapsed time per iteration (ms): 622731.4 | throughput per GPU (TFLOP/s/GPU): 83.6 | learning rate: 4.375557E-07 | global batch size:     8 | lm loss: 8.728430E-01 | loss scale: 1.0 | grad norm: 0.695 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215c627e80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5168cfe40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
 [2024-12-01 09:18:24] iteration      419/     500 | consumed samples:         3352 | elapsed time per iteration (ms): 644159.3 | throughput per GPU (TFLOP/s/GPU): 80.8 | learning rate: 4.295615E-07 | global batch size:     8 | lm loss: 8.373601E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
 [2024-12-01 09:27:56] iteration      420/     500 | consumed samples:         3360 | elapsed time per iteration (ms): 572431.4 | throughput per GPU (TFLOP/s/GPU): 90.9 | learning rate: 4.216562E-07 | global batch size:     8 | lm loss: 7.823456E-01 | loss scale: 1.0 | grad norm: 0.607 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (259402.89, 259403.23)
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
processed_samples 17500 unjoint_samples 17500 joint_samples 54 [60450, 1019283]
processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1047235, 700229]
processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046364, 138426]
processed_samples 17500 unjoint_samples 17500 joint_samples 53 [19392, 1046811]
processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1045065, 565351]
processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1047235, 700229]
processed_samples 17500 unjoint_samples 17500 joint_samples 54 [60450, 1019283]
processed_samples 17500 unjoint_samples 17500 joint_samples 52 [784006, 1024488]
processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046364, 138426]
processed_samples 17500 unjoint_samples 17500 joint_samples 51 [1046874, 929482]
processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1045065, 565351]
processed_samples 17500 unjoint_samples 17500 joint_samples 52 [784006, 1024488]
processed_samples 17500 unjoint_samples 17500 joint_samples 53 [19392, 1046811]
processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046496, 962646]
processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046496, 962646]
processed_samples 17500 unjoint_samples 17500 joint_samples 51 [1046874, 929482]
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
 [2024-12-01 09:41:18] iteration      421/     500 | consumed samples:         3368 | elapsed time per iteration (ms): 542009.0 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.138403E-07 | global batch size:     8 | lm loss: 8.172854E-01 | loss scale: 1.0 | grad norm: 0.735 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
 [2024-12-01 09:51:58] iteration      422/     500 | consumed samples:         3376 | elapsed time per iteration (ms): 640585.4 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 4.061140E-07 | global batch size:     8 | lm loss: 8.059546E-01 | loss scale: 1.0 | grad norm: 0.653 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f714500] mmco: unref short failure
[h264 @ 0x55d5168cfe40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1046874, 188377]
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1045065, 800487]
processed_samples 17600 unjoint_samples 17600 joint_samples 53 [1046364, 427102]
processed_samples 17600 unjoint_samples 17600 joint_samples 54 [1046910, 200641]
processed_samples 17600 unjoint_samples 17600 joint_samples 53 [308994, 1046811]
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1002135, 1024488]
processed_samples 17600 unjoint_samples 17600 joint_samples 54 [437610, 1019283]
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1046874, 188377]
processed_samples 17600 unjoint_samples 17600 joint_samples 54 [1046910, 200641]
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1047235, 983555]
processed_samples 17600 unjoint_samples 17600 joint_samples 53 [308994, 1046811]
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1045065, 800487]
processed_samples 17600 unjoint_samples 17600 joint_samples 53 [1046364, 427102]
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1047235, 983555]
processed_samples 17600 unjoint_samples 17600 joint_samples 54 [437610, 1019283]
processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1002135, 1024488]
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
 [2024-12-01 10:00:49] iteration      423/     500 | consumed samples:         3384 | elapsed time per iteration (ms): 530376.6 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 3.984776E-07 | global batch size:     8 | lm loss: 8.025057E-01 | loss scale: 1.0 | grad norm: 1.568 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
 [2024-12-01 10:10:12] iteration      424/     500 | consumed samples:         3392 | elapsed time per iteration (ms): 563357.7 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 3.909315E-07 | global batch size:     8 | lm loss: 8.046057E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-12-01 10:18:09] iteration      425/     500 | consumed samples:         3400 | elapsed time per iteration (ms): 477347.5 | throughput per GPU (TFLOP/s/GPU): 109.0 | learning rate: 3.834760E-07 | global batch size:     8 | lm loss: 8.117014E-01 | loss scale: 1.0 | grad norm: 0.497 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b70be40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 17700 unjoint_samples 17700 joint_samples 52 [1046874, 504009]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [1046364, 735773]
[h264 @ 0x56215b523440] mmco: unref short failure
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [196340, 1047550]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [349109, 921903]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [220594, 1037240]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [700155, 1046811]
processed_samples 17700 unjoint_samples 17700 joint_samples 54 [1046910, 463930]
processed_samples 17700 unjoint_samples 17700 joint_samples 52 [1046874, 504009]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [1046364, 735773]
processed_samples 17700 unjoint_samples 17700 joint_samples 54 [1046910, 463930]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [196340, 1047550]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [220594, 1037240]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [349109, 921903]
processed_samples 17700 unjoint_samples 17700 joint_samples 54 [747400, 1019283]
processed_samples 17700 unjoint_samples 17700 joint_samples 53 [700155, 1046811]
processed_samples 17700 unjoint_samples 17700 joint_samples 54 [747400, 1019283]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215b100780] mmco: unref short failure
[h264 @ 0x56215b100780] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b100780] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5178a9980] mmco: unref short failure
[h264 @ 0x55d5178a9980] mmco: unref short failure
[h264 @ 0x55d5178a9980] mmco: unref short failure
[h264 @ 0x55d5178a9980] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
processed_samples 17800 unjoint_samples 17800 joint_samples 54 [38127, 1039929]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [478442, 1037240]
processed_samples 17800 unjoint_samples 17800 joint_samples 52 [1046874, 851580]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [440619, 1047550]
processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1046910, 980262]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [609333, 921903]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [1024371, 1046811]
processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1014725, 1019283]
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
processed_samples 17800 unjoint_samples 17800 joint_samples 52 [1046874, 851580]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [440619, 1047550]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [478442, 1037240]
processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1014725, 1019283]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [1024371, 1046811]
processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1046910, 980262]
processed_samples 17800 unjoint_samples 17800 joint_samples 53 [609333, 921903]
processed_samples 17800 unjoint_samples 17800 joint_samples 54 [38127, 1039929]
 [2024-12-01 10:29:57] iteration      426/     500 | consumed samples:         3408 | elapsed time per iteration (ms): 708209.6 | throughput per GPU (TFLOP/s/GPU): 73.5 | learning rate: 3.761115E-07 | global batch size:     8 | lm loss: 7.919018E-01 | loss scale: 1.0 | grad norm: 0.506 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
 [2024-12-01 10:40:57] iteration      427/     500 | consumed samples:         3416 | elapsed time per iteration (ms): 659879.3 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 3.688381E-07 | global batch size:     8 | lm loss: 8.052567E-01 | loss scale: 1.0 | grad norm: 1.120 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51327b000] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
 [2024-12-01 10:49:45] iteration      428/     500 | consumed samples:         3424 | elapsed time per iteration (ms): 527278.1 | throughput per GPU (TFLOP/s/GPU): 98.7 | learning rate: 3.616562E-07 | global batch size:     8 | lm loss: 8.389349E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 10:57:42] iteration      429/     500 | consumed samples:         3432 | elapsed time per iteration (ms): 476971.2 | throughput per GPU (TFLOP/s/GPU): 109.1 | learning rate: 3.545662E-07 | global batch size:     8 | lm loss: 7.981853E-01 | loss scale: 1.0 | grad norm: 0.759 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
 [2024-12-01 11:07:54] iteration      430/     500 | consumed samples:         3440 | elapsed time per iteration (ms): 612005.4 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 3.475682E-07 | global batch size:     8 | lm loss: 8.206917E-01 | loss scale: 1.0 | grad norm: 0.595 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x55d516c18800] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 17900 unjoint_samples 17900 joint_samples 55 [244587, 1041944]
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [117114, 1043785]
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [765583, 1047550]
processed_samples 17900 unjoint_samples 17900 joint_samples 54 [358824, 1039929]
[h264 @ 0x56215f6b5640] mmco: unref short failure
processed_samples 17900 unjoint_samples 17900 joint_samples 55 [1036890, 387692]
processed_samples 17900 unjoint_samples 17900 joint_samples 54 [220760, 1046811]
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [784032, 1037240]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [997954, 937444]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 17900 unjoint_samples 17900 joint_samples 55 [244587, 1041944]
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [117114, 1043785]
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [765583, 1047550]
processed_samples 17900 unjoint_samples 17900 joint_samples 54 [358824, 1039929]
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 17900 unjoint_samples 17900 joint_samples 55 [1036890, 387692]
processed_samples 17900 unjoint_samples 17900 joint_samples 54 [220760, 1046811]
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [784032, 1037240]
processed_samples 17900 unjoint_samples 17900 joint_samples 53 [997954, 937444]
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
 [2024-12-01 11:17:44] iteration      431/     500 | consumed samples:         3448 | elapsed time per iteration (ms): 590797.5 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 3.406627E-07 | global batch size:     8 | lm loss: 8.949883E-01 | loss scale: 1.0 | grad norm: 0.689 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
 [2024-12-01 11:27:48] iteration      432/     500 | consumed samples:         3456 | elapsed time per iteration (ms): 603113.4 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 3.338499E-07 | global batch size:     8 | lm loss: 7.817925E-01 | loss scale: 1.0 | grad norm: 0.671 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 18000 unjoint_samples 18000 joint_samples 53 [440767, 1043785]
[h264 @ 0x55d514594d00] mmco: unref short failure
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1037696, 157113]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [52930, 1047168]
processed_samples 18000 unjoint_samples 18000 joint_samples 55 [590070, 1041944]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [525706, 1046811]
processed_samples 18000 unjoint_samples 18000 joint_samples 55 [1036890, 750150]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [672333, 1039929]
processed_samples 18000 unjoint_samples 18000 joint_samples 53 [440767, 1043785]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [672333, 1039929]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1037696, 157113]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1047402, 4518]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [52930, 1047168]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [525706, 1046811]
processed_samples 18000 unjoint_samples 18000 joint_samples 55 [590070, 1041944]
processed_samples 18000 unjoint_samples 18000 joint_samples 55 [1036890, 750150]
processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1047402, 4518]
 [2024-12-01 11:34:52] iteration      433/     500 | consumed samples:         3464 | elapsed time per iteration (ms): 423893.8 | throughput per GPU (TFLOP/s/GPU): 122.8 | learning rate: 3.271301E-07 | global batch size:     8 | lm loss: 7.689373E-01 | loss scale: 1.0 | grad norm: 0.666 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d5170b6cc0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x55d5168d06c0] mmco: unref short failure
[h264 @ 0x56215f714500] mmco: unref short failure
[h264 @ 0x56215f714500] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5169efdc0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f714500] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [337102, 1047168]
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1047402, 331027]
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1037696, 404133]
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [792931, 1046811]
[h264 @ 0x55d5149ada40] mmco: unref short failure
processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1040475, 1040736]
processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1045631, 3389]
processed_samples 18100 unjoint_samples 18100 joint_samples 55 [896076, 1041944]
processed_samples 18100 unjoint_samples 18100 joint_samples 53 [696959, 1043785]
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 18100 unjoint_samples 18100 joint_samples 53 [696959, 1043785]
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [337102, 1047168]
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1047402, 331027]
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1037696, 404133]
processed_samples 18100 unjoint_samples 18100 joint_samples 55 [896076, 1041944]
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1040475, 1040736]
processed_samples 18100 unjoint_samples 18100 joint_samples 54 [792931, 1046811]
processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1045631, 3389]
 [2024-12-01 11:45:05] iteration      434/     500 | consumed samples:         3472 | elapsed time per iteration (ms): 613843.4 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 3.205035E-07 | global batch size:     8 | lm loss: 7.734962E-01 | loss scale: 1.0 | grad norm: 0.539 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
 [2024-12-01 11:56:07] iteration      435/     500 | consumed samples:         3480 | elapsed time per iteration (ms): 661397.0 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 3.139705E-07 | global batch size:     8 | lm loss: 8.416315E-01 | loss scale: 1.0 | grad norm: 0.613 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215f4fa380] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-12-01 12:05:09] iteration      436/     500 | consumed samples:         3488 | elapsed time per iteration (ms): 542389.6 | throughput per GPU (TFLOP/s/GPU): 95.9 | learning rate: 3.075313E-07 | global batch size:     8 | lm loss: 7.784573E-01 | loss scale: 1.0 | grad norm: 0.670 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
 [2024-12-01 12:17:23] iteration      437/     500 | consumed samples:         3496 | elapsed time per iteration (ms): 733435.4 | throughput per GPU (TFLOP/s/GPU): 71.0 | learning rate: 3.011862E-07 | global batch size:     8 | lm loss: 7.929318E-01 | loss scale: 1.0 | grad norm: 0.814 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d512fe9900] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1037696, 787783]
processed_samples 18200 unjoint_samples 18200 joint_samples 56 [138085, 1046744]
processed_samples 18200 unjoint_samples 18200 joint_samples 56 [261580, 1047141]
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1045631, 237211]
processed_samples 18200 unjoint_samples 18200 joint_samples 54 [649075, 1047168]
processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1023804, 113039]
processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1047402, 646699]
processed_samples 18200 unjoint_samples 18200 joint_samples 53 [983067, 1043785]
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
processed_samples 18200 unjoint_samples 18200 joint_samples 56 [261580, 1047141]
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
processed_samples 18200 unjoint_samples 18200 joint_samples 56 [138085, 1046744]
processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1023804, 113039]
processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1045631, 237211]
processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1037696, 787783]
processed_samples 18200 unjoint_samples 18200 joint_samples 54 [649075, 1047168]
processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1047402, 646699]
processed_samples 18200 unjoint_samples 18200 joint_samples 53 [983067, 1043785]
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
 [2024-12-01 12:27:08] iteration      438/     500 | consumed samples:         3504 | elapsed time per iteration (ms): 585613.9 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 2.949354E-07 | global batch size:     8 | lm loss: 8.072410E-01 | loss scale: 1.0 | grad norm: 0.638 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-12-01 12:37:04] iteration      439/     500 | consumed samples:         3512 | elapsed time per iteration (ms): 596157.9 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 2.887793E-07 | global batch size:     8 | lm loss: 8.036357E-01 | loss scale: 1.0 | grad norm: 0.636 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
processed_samples 18300 unjoint_samples 18300 joint_samples 54 [1047402, 907297]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 18300 unjoint_samples 18300 joint_samples 55 [103075, 1019416]
processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1023804, 370838]
processed_samples 18300 unjoint_samples 18300 joint_samples 54 [290617, 1046042]
processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1045631, 490566]
processed_samples 18300 unjoint_samples 18300 joint_samples 56 [635398, 1047141]
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
processed_samples 18300 unjoint_samples 18300 joint_samples 56 [444978, 1046744]
processed_samples 18300 unjoint_samples 18300 joint_samples 54 [1047402, 907297]
processed_samples 18300 unjoint_samples 18300 joint_samples 55 [103075, 1019416]
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
processed_samples 18300 unjoint_samples 18300 joint_samples 54 [290617, 1046042]
processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1023804, 370838]
processed_samples 18300 unjoint_samples 18300 joint_samples 54 [941634, 1047168]
processed_samples 18300 unjoint_samples 18300 joint_samples 56 [444978, 1046744]
processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1045631, 490566]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 18300 unjoint_samples 18300 joint_samples 56 [635398, 1047141]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
processed_samples 18300 unjoint_samples 18300 joint_samples 54 [941634, 1047168]
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-12-01 12:47:49] iteration      440/     500 | consumed samples:         3520 | elapsed time per iteration (ms): 644450.1 | throughput per GPU (TFLOP/s/GPU): 80.8 | learning rate: 2.827180E-07 | global batch size:     8 | lm loss: 8.524545E-01 | loss scale: 1.0 | grad norm: 0.960 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (149544.11, 149544.48)
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
 [2024-12-01 12:58:26] iteration      441/     500 | consumed samples:         3528 | elapsed time per iteration (ms): 487681.0 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 2.767519E-07 | global batch size:     8 | lm loss: 8.442378E-01 | loss scale: 1.0 | grad norm: 0.576 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d516c21040] Missing reference picture, default is 65530
[h264 @ 0x55d516c21040] Missing reference picture, default is 65530
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d733d80] Missing reference picture, default is 65530
[h264 @ 0x56215d733d80] Missing reference picture, default is 65530
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1047402, 146642]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1047402, 146642]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [362857, 1019416]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1045631, 844468]
processed_samples 18400 unjoint_samples 18400 joint_samples 56 [919411, 1047141]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [995693, 286355]
processed_samples 18400 unjoint_samples 18400 joint_samples 54 [574148, 1046042]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1023804, 651751]
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
processed_samples 18400 unjoint_samples 18400 joint_samples 56 [692029, 1046744]
processed_samples 18400 unjoint_samples 18400 joint_samples 54 [574148, 1046042]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [362857, 1019416]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [995693, 286355]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1023804, 651751]
processed_samples 18400 unjoint_samples 18400 joint_samples 56 [692029, 1046744]
processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1045631, 844468]
[h264 @ 0x56215b17e600] mmco: unref short failure
processed_samples 18400 unjoint_samples 18400 joint_samples 56 [919411, 1047141]
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
 [2024-12-01 13:09:14] iteration      442/     500 | consumed samples:         3536 | elapsed time per iteration (ms): 647586.4 | throughput per GPU (TFLOP/s/GPU): 80.4 | learning rate: 2.708811E-07 | global batch size:     8 | lm loss: 8.337551E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
 [2024-12-01 13:18:43] iteration      443/     500 | consumed samples:         3544 | elapsed time per iteration (ms): 568784.5 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 2.651060E-07 | global batch size:     8 | lm loss: 8.120881E-01 | loss scale: 1.0 | grad norm: 0.553 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5140bdd40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 18500 unjoint_samples 18500 joint_samples 56 [31916, 1044722]
processed_samples 18500 unjoint_samples 18500 joint_samples 56 [31916, 1044722]
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1047402, 518299]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1047402, 518299]
processed_samples 18500 unjoint_samples 18500 joint_samples 54 [900010, 1046042]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [640036, 1019416]
processed_samples 18500 unjoint_samples 18500 joint_samples 57 [134414, 1047141]
processed_samples 18500 unjoint_samples 18500 joint_samples 54 [900010, 1046042]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1023804, 1012752]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [640036, 1019416]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1023804, 1012752]
processed_samples 18500 unjoint_samples 18500 joint_samples 57 [134414, 1047141]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [995693, 510802]
processed_samples 18500 unjoint_samples 18500 joint_samples 55 [995693, 510802]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
processed_samples 18500 unjoint_samples 18500 joint_samples 56 [943092, 1046744]
processed_samples 18500 unjoint_samples 18500 joint_samples 56 [943092, 1046744]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215b453c80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
 [2024-12-01 13:30:27] iteration      444/     500 | consumed samples:         3552 | elapsed time per iteration (ms): 704704.3 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 2.594267E-07 | global batch size:     8 | lm loss: 8.168926E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5140b3b40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
 [2024-12-01 13:41:06] iteration      445/     500 | consumed samples:         3560 | elapsed time per iteration (ms): 638414.9 | throughput per GPU (TFLOP/s/GPU): 81.5 | learning rate: 2.538436E-07 | global batch size:     8 | lm loss: 8.196800E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
 [2024-12-01 13:50:59] iteration      446/     500 | consumed samples:         3568 | elapsed time per iteration (ms): 593547.3 | throughput per GPU (TFLOP/s/GPU): 87.7 | learning rate: 2.483568E-07 | global batch size:     8 | lm loss: 8.253284E-01 | loss scale: 1.0 | grad norm: 0.566 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [118948, 1046602]
processed_samples 18600 unjoint_samples 18600 joint_samples 56 [280311, 1044722]
processed_samples 18600 unjoint_samples 18600 joint_samples 57 [1026406, 248733]
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [1047402, 815598]
processed_samples 18600 unjoint_samples 18600 joint_samples 57 [555359, 1047141]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [980454, 1019416]
processed_samples 18600 unjoint_samples 18600 joint_samples 56 [298276, 1041104]
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [118948, 1046602]
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [995693, 815666]
processed_samples 18600 unjoint_samples 18600 joint_samples 56 [280311, 1044722]
processed_samples 18600 unjoint_samples 18600 joint_samples 57 [1026406, 248733]
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [1047402, 815598]
processed_samples 18600 unjoint_samples 18600 joint_samples 57 [555359, 1047141]
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [980454, 1019416]
processed_samples 18600 unjoint_samples 18600 joint_samples 56 [298276, 1041104]
[h264 @ 0x55d514426700] mmco: unref short failure
processed_samples 18600 unjoint_samples 18600 joint_samples 55 [995693, 815666]
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
 [2024-12-01 13:59:36] iteration      447/     500 | consumed samples:         3576 | elapsed time per iteration (ms): 517153.1 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 2.429665E-07 | global batch size:     8 | lm loss: 7.605423E-01 | loss scale: 1.0 | grad norm: 0.673 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
 [2024-12-01 14:09:35] iteration      448/     500 | consumed samples:         3584 | elapsed time per iteration (ms): 598084.5 | throughput per GPU (TFLOP/s/GPU): 87.0 | learning rate: 2.376731E-07 | global batch size:     8 | lm loss: 7.981852E-01 | loss scale: 1.0 | grad norm: 0.653 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
processed_samples 18700 unjoint_samples 18700 joint_samples 57 [1026406, 551230]
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
processed_samples 18700 unjoint_samples 18700 joint_samples 57 [1026406, 551230]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [253149, 1047183]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [253149, 1047183]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [25383, 1041705]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [25383, 1041705]
processed_samples 18700 unjoint_samples 18700 joint_samples 55 [386254, 1046602]
processed_samples 18700 unjoint_samples 18700 joint_samples 55 [386254, 1046602]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [735119, 1044722]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [735119, 1044722]
processed_samples 18700 unjoint_samples 18700 joint_samples 57 [767829, 1047141]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [788303, 1041104]
processed_samples 18700 unjoint_samples 18700 joint_samples 57 [767829, 1047141]
processed_samples 18700 unjoint_samples 18700 joint_samples 56 [788303, 1041104]
processed_samples 18700 unjoint_samples 18700 joint_samples 55 [1033694, 1034262]
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
processed_samples 18700 unjoint_samples 18700 joint_samples 55 [1033694, 1034262]
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
 [2024-12-01 14:19:52] iteration      449/     500 | consumed samples:         3592 | elapsed time per iteration (ms): 617372.2 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 2.324767E-07 | global batch size:     8 | lm loss: 8.294312E-01 | loss scale: 1.0 | grad norm: 0.597 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215f0f3d40] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x56215af7d040] mmco: unref short failure
[h264 @ 0x55d51a2c2780] mmco: unref short failure
[h264 @ 0x55d51a2c2780] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
 [2024-12-01 14:27:43] iteration      450/     500 | consumed samples:         3600 | elapsed time per iteration (ms): 470732.4 | throughput per GPU (TFLOP/s/GPU): 110.6 | learning rate: 2.273775E-07 | global batch size:     8 | lm loss: 7.941829E-01 | loss scale: 1.0 | grad norm: 0.700 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51a2c2780] mmco: unref short failure
 [2024-12-01 14:40:02] iteration      451/     500 | consumed samples:         3608 | elapsed time per iteration (ms): 739733.5 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 2.223758E-07 | global batch size:     8 | lm loss: 8.482308E-01 | loss scale: 1.0 | grad norm: 0.898 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1046793, 55820]
processed_samples 18800 unjoint_samples 18800 joint_samples 55 [792957, 1046602]
processed_samples 18800 unjoint_samples 18800 joint_samples 55 [792957, 1046602]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1046793, 55820]
processed_samples 18800 unjoint_samples 18800 joint_samples 56 [260160, 1041705]
processed_samples 18800 unjoint_samples 18800 joint_samples 56 [1038546, 315206]
processed_samples 18800 unjoint_samples 18800 joint_samples 56 [260160, 1041705]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1026406, 971244]
processed_samples 18800 unjoint_samples 18800 joint_samples 56 [1038546, 315206]
processed_samples 18800 unjoint_samples 18800 joint_samples 56 [472463, 1047183]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1047210, 144209]
processed_samples 18800 unjoint_samples 18800 joint_samples 56 [472463, 1047183]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1047210, 144209]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1026406, 971244]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1031035, 1047141]
processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1031035, 1047141]
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x56215b879700] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215ba5b0c0] mmco: unref short failure
 [2024-12-01 14:48:07] iteration      452/     500 | consumed samples:         3616 | elapsed time per iteration (ms): 484888.6 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 2.174717E-07 | global batch size:     8 | lm loss: 7.795358E-01 | loss scale: 1.0 | grad norm: 0.744 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
 [2024-12-01 14:55:46] iteration      453/     500 | consumed samples:         3624 | elapsed time per iteration (ms): 458864.6 | throughput per GPU (TFLOP/s/GPU): 113.4 | learning rate: 2.126655E-07 | global batch size:     8 | lm loss: 7.805303E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x562163d38640] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x56215de0a2c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
 [2024-12-01 15:05:07] iteration      454/     500 | consumed samples:         3632 | elapsed time per iteration (ms): 560659.6 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 2.079574E-07 | global batch size:     8 | lm loss: 8.343635E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [569433, 1041705]
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [95527, 1046602]
processed_samples 18900 unjoint_samples 18900 joint_samples 58 [329619, 1029045]
processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1046793, 412118]
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [1038546, 728147]
processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1047210, 474342]
processed_samples 18900 unjoint_samples 18900 joint_samples 58 [317017, 1047141]
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [805862, 1047183]
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [569433, 1041705]
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [95527, 1046602]
processed_samples 18900 unjoint_samples 18900 joint_samples 58 [329619, 1029045]
processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1046793, 412118]
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [1038546, 728147]
processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1047210, 474342]
processed_samples 18900 unjoint_samples 18900 joint_samples 58 [317017, 1047141]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 18900 unjoint_samples 18900 joint_samples 56 [805862, 1047183]
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d4ee5f85c0] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d5145cc380] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-12-01 15:15:40] iteration      455/     500 | consumed samples:         3640 | elapsed time per iteration (ms): 632799.3 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 2.033476E-07 | global batch size:     8 | lm loss: 8.013310E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215b9b7740] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
processed_samples 19000 unjoint_samples 19000 joint_samples 57 [65066, 1047183]
processed_samples 19000 unjoint_samples 19000 joint_samples 56 [304472, 1046602]
processed_samples 19000 unjoint_samples 19000 joint_samples 58 [698928, 1029045]
processed_samples 19000 unjoint_samples 19000 joint_samples 56 [886793, 1041705]
processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1046793, 730143]
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1047210, 906829]
processed_samples 19000 unjoint_samples 19000 joint_samples 58 [603869, 1047141]
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
processed_samples 19000 unjoint_samples 19000 joint_samples 56 [304472, 1046602]
processed_samples 19000 unjoint_samples 19000 joint_samples 56 [1043659, 1043971]
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
processed_samples 19000 unjoint_samples 19000 joint_samples 57 [65066, 1047183]
processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1046793, 730143]
processed_samples 19000 unjoint_samples 19000 joint_samples 58 [698928, 1029045]
processed_samples 19000 unjoint_samples 19000 joint_samples 56 [886793, 1041705]
processed_samples 19000 unjoint_samples 19000 joint_samples 58 [603869, 1047141]
processed_samples 19000 unjoint_samples 19000 joint_samples 56 [1043659, 1043971]
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1047210, 906829]
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
 [2024-12-01 15:23:19] iteration      456/     500 | consumed samples:         3648 | elapsed time per iteration (ms): 459434.2 | throughput per GPU (TFLOP/s/GPU): 113.3 | learning rate: 1.988362E-07 | global batch size:     8 | lm loss: 8.068063E-01 | loss scale: 1.0 | grad norm: 0.831 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
 [2024-12-01 15:32:27] iteration      457/     500 | consumed samples:         3656 | elapsed time per iteration (ms): 547788.4 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 1.944234E-07 | global batch size:     8 | lm loss: 8.406883E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f63a6c0] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215f63a6c0] mmco: unref short failure
[h264 @ 0x56215f63a6c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d513099d80] mmco: unref short failure
[h264 @ 0x55d513099d80] mmco: unref short failure
 [2024-12-01 15:41:17] iteration      458/     500 | consumed samples:         3664 | elapsed time per iteration (ms): 530106.6 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 1.901095E-07 | global batch size:     8 | lm loss: 8.074347E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1048091, 24957]
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1046793, 1039709]
processed_samples 19100 unjoint_samples 19100 joint_samples 56 [675892, 1046602]
processed_samples 19100 unjoint_samples 19100 joint_samples 58 [920268, 1047141]
[h264 @ 0x55d5143806c0] mmco: unref short failure
processed_samples 19100 unjoint_samples 19100 joint_samples 58 [1047210, 309387]
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [409035, 1046382]
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [364518, 1047183]
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
processed_samples 19100 unjoint_samples 19100 joint_samples 58 [920268, 1047141]
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1048091, 24957]
[h264 @ 0x56215c3e4200] mmco: unref short failure
processed_samples 19100 unjoint_samples 19100 joint_samples 58 [1047210, 309387]
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [409035, 1046382]
processed_samples 19100 unjoint_samples 19100 joint_samples 58 [983695, 1029045]
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [364518, 1047183]
processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1046793, 1039709]
processed_samples 19100 unjoint_samples 19100 joint_samples 58 [983695, 1029045]
processed_samples 19100 unjoint_samples 19100 joint_samples 56 [675892, 1046602]
[h264 @ 0x55d5171f5180] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
 [2024-12-01 15:53:37] iteration      459/     500 | consumed samples:         3672 | elapsed time per iteration (ms): 739678.3 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 1.858946E-07 | global batch size:     8 | lm loss: 8.207039E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
 [2024-12-01 16:03:51] iteration      460/     500 | consumed samples:         3680 | elapsed time per iteration (ms): 614020.7 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 1.817789E-07 | global batch size:     8 | lm loss: 7.755750E-01 | loss scale: 1.0 | grad norm: 0.541 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (156851.09, 156851.38)
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d513099d80] mmco: unref short failure
[h264 @ 0x55d513099d80] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5145449c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
 [2024-12-01 16:15:19] iteration      461/     500 | consumed samples:         3688 | elapsed time per iteration (ms): 531317.9 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 1.777626E-07 | global batch size:     8 | lm loss: 8.081508E-01 | loss scale: 1.0 | grad norm: 0.616 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 57 [737652, 1046382]
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 59 [1007487, 313847]
processed_samples 19200 unjoint_samples 19200 joint_samples 59 [167467, 1047141]
processed_samples 19200 unjoint_samples 19200 joint_samples 57 [1048091, 334724]
[h264 @ 0x56215b217200] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 58 [261142, 1041956]
processed_samples 19200 unjoint_samples 19200 joint_samples 57 [619786, 1047183]
processed_samples 19200 unjoint_samples 19200 joint_samples 58 [1047210, 573972]
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 56 [908267, 1046602]
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d512746f00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 57 [737652, 1046382]
processed_samples 19200 unjoint_samples 19200 joint_samples 58 [1047210, 573972]
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 59 [1007487, 313847]
processed_samples 19200 unjoint_samples 19200 joint_samples 59 [167467, 1047141]
processed_samples 19200 unjoint_samples 19200 joint_samples 57 [1048091, 334724]
processed_samples 19200 unjoint_samples 19200 joint_samples 58 [261142, 1041956]
processed_samples 19200 unjoint_samples 19200 joint_samples 57 [619786, 1047183]
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
processed_samples 19200 unjoint_samples 19200 joint_samples 56 [908267, 1046602]
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
 [2024-12-01 16:26:07] iteration      462/     500 | consumed samples:         3696 | elapsed time per iteration (ms): 648360.6 | throughput per GPU (TFLOP/s/GPU): 80.3 | learning rate: 1.738458E-07 | global batch size:     8 | lm loss: 8.348542E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d513034b80] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
processed_samples 19300 unjoint_samples 19300 joint_samples 58 [485454, 1041956]
processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1000455, 204511]
processed_samples 19300 unjoint_samples 19300 joint_samples 59 [1007487, 683177]
processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1048091, 729811]
processed_samples 19300 unjoint_samples 19300 joint_samples 57 [897921, 1047183]
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1046284, 8585]
[h264 @ 0x56215b8cc100] mmco: unref short failure
[h264 @ 0x56215b8cc100] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
[h264 @ 0x56215cef2780] mmco: unref short failure
processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1000455, 204511]
processed_samples 19300 unjoint_samples 19300 joint_samples 58 [485454, 1041956]
processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1048091, 729811]
processed_samples 19300 unjoint_samples 19300 joint_samples 59 [1007487, 683177]
processed_samples 19300 unjoint_samples 19300 joint_samples 57 [897921, 1047183]
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1046284, 8585]
 [2024-12-01 16:36:01] iteration      463/     500 | consumed samples:         3704 | elapsed time per iteration (ms): 593789.3 | throughput per GPU (TFLOP/s/GPU): 87.6 | learning rate: 1.700287E-07 | global batch size:     8 | lm loss: 8.368133E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1047210, 1009908]
processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1047210, 1009908]
processed_samples 19300 unjoint_samples 19300 joint_samples 59 [363026, 1047141]
processed_samples 19300 unjoint_samples 19300 joint_samples 59 [363026, 1047141]
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
 [2024-12-01 16:44:38] iteration      464/     500 | consumed samples:         3712 | elapsed time per iteration (ms): 516905.7 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 1.663114E-07 | global batch size:     8 | lm loss: 8.324036E-01 | loss scale: 1.0 | grad norm: 0.696 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
 [2024-12-01 16:52:33] iteration      465/     500 | consumed samples:         3720 | elapsed time per iteration (ms): 475094.9 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 1.626942E-07 | global batch size:     8 | lm loss: 7.755831E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 17:02:51] iteration      466/     500 | consumed samples:         3728 | elapsed time per iteration (ms): 617440.9 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 1.591772E-07 | global batch size:     8 | lm loss: 7.885929E-01 | loss scale: 1.0 | grad norm: 0.528 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
 [2024-12-01 17:14:28] iteration      467/     500 | consumed samples:         3736 | elapsed time per iteration (ms): 697117.0 | throughput per GPU (TFLOP/s/GPU): 74.7 | learning rate: 1.557604E-07 | global batch size:     8 | lm loss: 8.216323E-01 | loss scale: 1.0 | grad norm: 0.602 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [816460, 1041956]
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [816460, 1041956]
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d5135c10c0] mmco: unref short failure
[h264 @ 0x55d5135c10c0] mmco: unref short failure
processed_samples 19400 unjoint_samples 19400 joint_samples 59 [1047252, 252965]
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1048091, 71684]
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1048091, 71684]
processed_samples 19400 unjoint_samples 19400 joint_samples 59 [1047252, 252965]
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [130945, 1047183]
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [130945, 1047183]
processed_samples 19400 unjoint_samples 19400 joint_samples 60 [200308, 967315]
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1046284, 348046]
processed_samples 19400 unjoint_samples 19400 joint_samples 60 [200308, 967315]
processed_samples 19400 unjoint_samples 19400 joint_samples 57 [1000455, 463197]
processed_samples 19400 unjoint_samples 19400 joint_samples 57 [1000455, 463197]
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1046284, 348046]
processed_samples 19400 unjoint_samples 19400 joint_samples 59 [762947, 1047141]
processed_samples 19400 unjoint_samples 19400 joint_samples 59 [762947, 1047141]
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215b89c080] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5135c10c0] mmco: unref short failure
[h264 @ 0x55d5135c10c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b10e440] mmco: unref short failure
[h264 @ 0x56215b0a5600] mmco: unref short failure
[h264 @ 0x55d51bd54680] mmco: unref short failure
[h264 @ 0x55d51bd54680] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
 [2024-12-01 17:23:18] iteration      468/     500 | consumed samples:         3744 | elapsed time per iteration (ms): 530298.5 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 1.524441E-07 | global batch size:     8 | lm loss: 8.004850E-01 | loss scale: 1.0 | grad norm: 0.500 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d516b59080] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
 [2024-12-01 17:33:32] iteration      469/     500 | consumed samples:         3752 | elapsed time per iteration (ms): 613465.0 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 1.492284E-07 | global batch size:     8 | lm loss: 8.411949E-01 | loss scale: 1.0 | grad norm: 0.694 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
processed_samples 19500 unjoint_samples 19500 joint_samples 59 [98421, 1044129]
processed_samples 19500 unjoint_samples 19500 joint_samples 60 [520072, 967315]
processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1048091, 498445]
processed_samples 19500 unjoint_samples 19500 joint_samples 59 [1047252, 580686]
processed_samples 19500 unjoint_samples 19500 joint_samples 59 [98421, 1044129]
processed_samples 19500 unjoint_samples 19500 joint_samples 58 [428724, 1047183]
processed_samples 19500 unjoint_samples 19500 joint_samples 58 [428724, 1047183]
processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1046284, 654391]
processed_samples 19500 unjoint_samples 19500 joint_samples 57 [1000455, 750201]
processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1048091, 498445]
processed_samples 19500 unjoint_samples 19500 joint_samples 60 [520072, 967315]
processed_samples 19500 unjoint_samples 19500 joint_samples 59 [1047252, 580686]
processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1046284, 654391]
processed_samples 19500 unjoint_samples 19500 joint_samples 60 [1046315, 65258]
processed_samples 19500 unjoint_samples 19500 joint_samples 60 [1046315, 65258]
[h264 @ 0x56215d350080] mmco: unref short failure
processed_samples 19500 unjoint_samples 19500 joint_samples 57 [1000455, 750201]
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x56215af78cc0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51bddb580] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
 [2024-12-01 17:43:55] iteration      470/     500 | consumed samples:         3760 | elapsed time per iteration (ms): 623450.1 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 1.461135E-07 | global batch size:     8 | lm loss: 7.550452E-01 | loss scale: 1.0 | grad norm: 0.538 | number of skipped iterations:   0 | number of nan iterations:   0 |
processed_samples 19600 unjoint_samples 19600 joint_samples 59 [375679, 1044129]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [31722, 1040879]
processed_samples 19600 unjoint_samples 19600 joint_samples 59 [1047252, 855252]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [663528, 1047183]
processed_samples 19600 unjoint_samples 19600 joint_samples 60 [816049, 967315]
processed_samples 19600 unjoint_samples 19600 joint_samples 60 [1046315, 403777]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1048091, 839233]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1046284, 989007]
processed_samples 19600 unjoint_samples 19600 joint_samples 59 [375679, 1044129]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [31722, 1040879]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1048091, 839233]
processed_samples 19600 unjoint_samples 19600 joint_samples 60 [816049, 967315]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [663528, 1047183]
processed_samples 19600 unjoint_samples 19600 joint_samples 60 [1046315, 403777]
processed_samples 19600 unjoint_samples 19600 joint_samples 59 [1047252, 855252]
processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1046284, 989007]
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d51714d1c0] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
 [2024-12-01 17:53:55] iteration      471/     500 | consumed samples:         3768 | elapsed time per iteration (ms): 600048.5 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 1.430994E-07 | global batch size:     8 | lm loss: 8.090125E-01 | loss scale: 1.0 | grad norm: 0.647 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
 [2024-12-01 18:03:05] iteration      472/     500 | consumed samples:         3776 | elapsed time per iteration (ms): 550162.4 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 1.401863E-07 | global batch size:     8 | lm loss: 8.100052E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d516b67a00] mmco: unref short failure
[h264 @ 0x56215d8bbcc0] mmco: unref short failure
 [2024-12-01 18:13:30] iteration      473/     500 | consumed samples:         3784 | elapsed time per iteration (ms): 625071.6 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 1.373743E-07 | global batch size:     8 | lm loss: 8.084416E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x55d514051f80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
 [2024-12-01 18:21:31] iteration      474/     500 | consumed samples:         3792 | elapsed time per iteration (ms): 480291.6 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 1.346635E-07 | global batch size:     8 | lm loss: 8.181583E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [84539, 1047531]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [946056, 1044129]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [954011, 205390]
processed_samples 19700 unjoint_samples 19700 joint_samples 60 [112581, 1045897]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [184479, 1038729]
processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1046315, 844762]
processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1015354, 1016036]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [84539, 1047531]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [946056, 1044129]
processed_samples 19700 unjoint_samples 19700 joint_samples 60 [112581, 1045897]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [954011, 205390]
processed_samples 19700 unjoint_samples 19700 joint_samples 59 [184479, 1038729]
processed_samples 19700 unjoint_samples 19700 joint_samples 58 [562460, 1040879]
processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1046315, 844762]
processed_samples 19700 unjoint_samples 19700 joint_samples 58 [562460, 1040879]
processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1015354, 1016036]
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215f8b9c80] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
 [2024-12-01 18:33:01] iteration      475/     500 | consumed samples:         3800 | elapsed time per iteration (ms): 690389.1 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 1.320541E-07 | global batch size:     8 | lm loss: 8.206989E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d4f080e440] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
 [2024-12-01 18:44:01] iteration      476/     500 | consumed samples:         3808 | elapsed time per iteration (ms): 659896.9 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 1.295461E-07 | global batch size:     8 | lm loss: 8.581502E-01 | loss scale: 1.0 | grad norm: 0.634 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 18:50:58] iteration      477/     500 | consumed samples:         3816 | elapsed time per iteration (ms): 417078.0 | throughput per GPU (TFLOP/s/GPU): 124.8 | learning rate: 1.271397E-07 | global batch size:     8 | lm loss: 7.727896E-01 | loss scale: 1.0 | grad norm: 0.577 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x56215b25ac00] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 19800 unjoint_samples 19800 joint_samples 61 [265482, 1044262]
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
processed_samples 19800 unjoint_samples 19800 joint_samples 60 [1046343, 311230]
processed_samples 19800 unjoint_samples 19800 joint_samples 59 [542442, 1047531]
processed_samples 19800 unjoint_samples 19800 joint_samples 61 [232279, 1034490]
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
processed_samples 19800 unjoint_samples 19800 joint_samples 58 [1026052, 1040879]
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
processed_samples 19800 unjoint_samples 19800 joint_samples 59 [462838, 1038729]
processed_samples 19800 unjoint_samples 19800 joint_samples 60 [473328, 1045897]
processed_samples 19800 unjoint_samples 19800 joint_samples 61 [265482, 1044262]
processed_samples 19800 unjoint_samples 19800 joint_samples 61 [232279, 1034490]
processed_samples 19800 unjoint_samples 19800 joint_samples 59 [542442, 1047531]
processed_samples 19800 unjoint_samples 19800 joint_samples 60 [1046343, 311230]
processed_samples 19800 unjoint_samples 19800 joint_samples 59 [954011, 559959]
processed_samples 19800 unjoint_samples 19800 joint_samples 58 [1026052, 1040879]
processed_samples 19800 unjoint_samples 19800 joint_samples 59 [462838, 1038729]
processed_samples 19800 unjoint_samples 19800 joint_samples 60 [473328, 1045897]
processed_samples 19800 unjoint_samples 19800 joint_samples 59 [954011, 559959]
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
 [2024-12-01 19:01:37] iteration      478/     500 | consumed samples:         3824 | elapsed time per iteration (ms): 639452.6 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 1.248349E-07 | global batch size:     8 | lm loss: 7.901743E-01 | loss scale: 1.0 | grad norm: 0.554 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d5173c1180] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x562163b30a80] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [1046416, 204673]
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [977170, 980548]
processed_samples 19900 unjoint_samples 19900 joint_samples 60 [1046343, 707013]
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [851276, 1047531]
processed_samples 19900 unjoint_samples 19900 joint_samples 61 [525522, 1034490]
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
processed_samples 19900 unjoint_samples 19900 joint_samples 60 [795319, 1045897]
processed_samples 19900 unjoint_samples 19900 joint_samples 61 [601682, 1044262]
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [746959, 1038729]
[h264 @ 0x55d51314e5c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215b35e3c0] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
processed_samples 19900 unjoint_samples 19900 joint_samples 61 [525522, 1034490]
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [1046416, 204673]
processed_samples 19900 unjoint_samples 19900 joint_samples 61 [601682, 1044262]
processed_samples 19900 unjoint_samples 19900 joint_samples 60 [1046343, 707013]
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [851276, 1047531]
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [746959, 1038729]
processed_samples 19900 unjoint_samples 19900 joint_samples 59 [977170, 980548]
processed_samples 19900 unjoint_samples 19900 joint_samples 60 [795319, 1045897]
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x55d514594d00] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215e2d08c0] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
 [2024-12-01 19:14:07] iteration      479/     500 | consumed samples:         3832 | elapsed time per iteration (ms): 749352.0 | throughput per GPU (TFLOP/s/GPU): 69.4 | learning rate: 1.226319E-07 | global batch size:     8 | lm loss: 7.682730E-01 | loss scale: 1.0 | grad norm: 0.636 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
 [2024-12-01 19:21:56] iteration      480/     500 | consumed samples:         3840 | elapsed time per iteration (ms): 468975.4 | throughput per GPU (TFLOP/s/GPU): 111.0 | learning rate: 1.205308E-07 | global batch size:     8 | lm loss: 7.834165E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations:   0 | number of nan iterations:   0 |
(min, max) time across ranks (ms):
    save-checkpoint ................................: (269678.47, 269678.92)
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
 [2024-12-01 19:36:42] iteration      481/     500 | consumed samples:         3848 | elapsed time per iteration (ms): 616920.2 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 1.185315E-07 | global batch size:     8 | lm loss: 7.638129E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1046343, 1010438]
processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1044665, 100033]
processed_samples 20000 unjoint_samples 20000 joint_samples 60 [312813, 998374]
processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1046416, 608727]
processed_samples 20000 unjoint_samples 20000 joint_samples 61 [914952, 1044262]
processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1046343, 1010438]
processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1044665, 100033]
processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1046416, 608727]
processed_samples 20000 unjoint_samples 20000 joint_samples 60 [312813, 998374]
processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1012305, 1038729]
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 20000 unjoint_samples 20000 joint_samples 61 [914952, 1044262]
processed_samples 20000 unjoint_samples 20000 joint_samples 61 [1047462, 61045]
processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1012305, 1038729]
[h264 @ 0x56215c492cc0] mmco: unref short failure
processed_samples 20000 unjoint_samples 20000 joint_samples 61 [1047462, 61045]
processed_samples 20000 unjoint_samples 20000 joint_samples 61 [812346, 1034490]
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
processed_samples 20000 unjoint_samples 20000 joint_samples 61 [812346, 1034490]
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-12-01 19:43:55] iteration      482/     500 | consumed samples:         3856 | elapsed time per iteration (ms): 432689.8 | throughput per GPU (TFLOP/s/GPU): 120.3 | learning rate: 1.166343E-07 | global batch size:     8 | lm loss: 7.994441E-01 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 19:57:16] iteration      483/     500 | consumed samples:         3864 | elapsed time per iteration (ms): 800993.8 | throughput per GPU (TFLOP/s/GPU): 65.0 | learning rate: 1.148392E-07 | global batch size:     8 | lm loss: 8.334441E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5149ada40] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d516aee0c0] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
 [2024-12-01 20:06:06] iteration      484/     500 | consumed samples:         3872 | elapsed time per iteration (ms): 529985.7 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 1.131463E-07 | global batch size:     8 | lm loss: 8.393610E-01 | loss scale: 1.0 | grad norm: 0.637 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x55d516adca40] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x56215d350080] mmco: unref short failure
[h264 @ 0x55d51300b900] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x55d516aa3180] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215babf100] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d51478a440] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x56215c077380] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x55d5171918c0] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-12-01 20:14:04] iteration      485/     500 | consumed samples:         3880 | elapsed time per iteration (ms): 477879.8 | throughput per GPU (TFLOP/s/GPU): 108.9 | learning rate: 1.115556E-07 | global batch size:     8 | lm loss: 8.007555E-01 | loss scale: 1.0 | grad norm: 0.675 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x5621606a3fc0] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
processed_samples 20100 unjoint_samples 20100 joint_samples 61 [1047462, 266841]
processed_samples 20100 unjoint_samples 20100 joint_samples 61 [400497, 1033549]
processed_samples 20100 unjoint_samples 20100 joint_samples 60 [668134, 998374]
processed_samples 20100 unjoint_samples 20100 joint_samples 62 [191447, 1044311]
processed_samples 20100 unjoint_samples 20100 joint_samples 62 [65666, 1040241]
processed_samples 20100 unjoint_samples 20100 joint_samples 60 [275657, 1044774]
processed_samples 20100 unjoint_samples 20100 joint_samples 60 [1044665, 487196]
processed_samples 20100 unjoint_samples 20100 joint_samples 59 [1046416, 1024246]
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
processed_samples 20100 unjoint_samples 20100 joint_samples 61 [1047462, 266841]
processed_samples 20100 unjoint_samples 20100 joint_samples 61 [400497, 1033549]
processed_samples 20100 unjoint_samples 20100 joint_samples 60 [668134, 998374]
processed_samples 20100 unjoint_samples 20100 joint_samples 62 [65666, 1040241]
processed_samples 20100 unjoint_samples 20100 joint_samples 62 [191447, 1044311]
processed_samples 20100 unjoint_samples 20100 joint_samples 60 [275657, 1044774]
processed_samples 20100 unjoint_samples 20100 joint_samples 60 [1044665, 487196]
processed_samples 20100 unjoint_samples 20100 joint_samples 59 [1046416, 1024246]
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x55d51407fb00] mmco: unref short failure
 [2024-12-01 20:22:12] iteration      486/     500 | consumed samples:         3888 | elapsed time per iteration (ms): 487957.7 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 1.100672E-07 | global batch size:     8 | lm loss: 8.267097E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x55d508176540] mmco: unref short failure
[h264 @ 0x56215c5f1200] mmco: unref short failure
[h264 @ 0x55d5135b3b40] mmco: unref short failure
[h264 @ 0x55d5135b3b40] mmco: unref short failure
[h264 @ 0x55d517cb1000] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215b523440] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x56215c3b4780] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d5135b3b40] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d5135b3b40] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x56215f5b0d40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x55d5139fcf40] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
processed_samples 20200 unjoint_samples 20200 joint_samples 62 [531865, 1044311]
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [344919, 1038046]
[h264 @ 0x56215b1efec0] mmco: unref short failure
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [1044665, 952866]
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [584681, 1044774]
processed_samples 20200 unjoint_samples 20200 joint_samples 61 [1047462, 697513]
processed_samples 20200 unjoint_samples 20200 joint_samples 61 [645939, 1033549]
processed_samples 20200 unjoint_samples 20200 joint_samples 62 [413173, 1040241]
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [945907, 998374]
[h264 @ 0x55d5145a1340] mmco: unref short failure
processed_samples 20200 unjoint_samples 20200 joint_samples 62 [531865, 1044311]
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [344919, 1038046]
processed_samples 20200 unjoint_samples 20200 joint_samples 62 [413173, 1040241]
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [1044665, 952866]
processed_samples 20200 unjoint_samples 20200 joint_samples 61 [645939, 1033549]
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [584681, 1044774]
processed_samples 20200 unjoint_samples 20200 joint_samples 60 [945907, 998374]
processed_samples 20200 unjoint_samples 20200 joint_samples 61 [1047462, 697513]
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215b0a7480] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215be9ff00] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
 [2024-12-01 20:33:26] iteration      487/     500 | consumed samples:         3896 | elapsed time per iteration (ms): 674174.1 | throughput per GPU (TFLOP/s/GPU): 77.2 | learning rate: 1.086813E-07 | global batch size:     8 | lm loss: 7.753429E-01 | loss scale: 1.0 | grad norm: 0.588 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
 [2024-12-01 20:41:55] iteration      488/     500 | consumed samples:         3904 | elapsed time per iteration (ms): 508561.6 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.073977E-07 | global batch size:     8 | lm loss: 8.101053E-01 | loss scale: 1.0 | grad norm: 0.628 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x56215b977f80] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215b1efec0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d512dcb440] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x56215cf234c0] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d51b873240] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [192732, 1046867]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [171092, 1046713]
processed_samples 20300 unjoint_samples 20300 joint_samples 62 [836409, 1044311]
processed_samples 20300 unjoint_samples 20300 joint_samples 60 [867649, 1044774]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [1047462, 963152]
processed_samples 20300 unjoint_samples 20300 joint_samples 62 [716439, 1040241]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [945851, 1033549]
processed_samples 20300 unjoint_samples 20300 joint_samples 60 [618860, 1038046]
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
processed_samples 20300 unjoint_samples 20300 joint_samples 60 [618860, 1038046]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [192732, 1046867]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [171092, 1046713]
processed_samples 20300 unjoint_samples 20300 joint_samples 62 [836409, 1044311]
processed_samples 20300 unjoint_samples 20300 joint_samples 62 [716439, 1040241]
processed_samples 20300 unjoint_samples 20300 joint_samples 60 [867649, 1044774]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [1047462, 963152]
processed_samples 20300 unjoint_samples 20300 joint_samples 61 [945851, 1033549]
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
[h264 @ 0x56215ba1fac0] mmco: unref short failure
 [2024-12-01 20:51:39] iteration      489/     500 | consumed samples:         3912 | elapsed time per iteration (ms): 584105.2 | throughput per GPU (TFLOP/s/GPU): 89.1 | learning rate: 1.062166E-07 | global batch size:     8 | lm loss: 8.325140E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x55d519613300] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
[h264 @ 0x56215b47d3c0] mmco: unref short failure
 [2024-12-01 20:59:55] iteration      490/     500 | consumed samples:         3920 | elapsed time per iteration (ms): 496115.7 | throughput per GPU (TFLOP/s/GPU): 104.9 | learning rate: 1.051381E-07 | global batch size:     8 | lm loss: 7.974008E-01 | loss scale: 1.0 | grad norm: 0.660 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x55d517306a80] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x56215bcba400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x56215b6ebf40] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
[h264 @ 0x56215b706800] mmco: unref short failure
 [2024-12-01 21:10:36] iteration      491/     500 | consumed samples:         3928 | elapsed time per iteration (ms): 640917.9 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 1.041621E-07 | global batch size:     8 | lm loss: 8.110026E-01 | loss scale: 1.0 | grad norm: 0.596 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215b7c4500] mmco: unref short failure
[h264 @ 0x55d51340ff80] mmco: unref short failure
 [2024-12-01 21:21:02] iteration      492/     500 | consumed samples:         3936 | elapsed time per iteration (ms): 625653.9 | throughput per GPU (TFLOP/s/GPU): 83.2 | learning rate: 1.032888E-07 | global batch size:     8 | lm loss: 8.021253E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x56215ec3c040] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x55d5143806c0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215ccdcdc0] mmco: unref short failure
[h264 @ 0x56215e474480] mmco: unref short failure
processed_samples 20400 unjoint_samples 20400 joint_samples 61 [603384, 1046713]
processed_samples 20400 unjoint_samples 20400 joint_samples 63 [73355, 1045348]
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1046106, 124157]
[h264 @ 0x56215f6b5640] mmco: unref short failure
[h264 @ 0x56215f6b5640] mmco: unref short failure
processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1047462, 335378]
processed_samples 20400 unjoint_samples 20400 joint_samples 61 [1045911, 122833]
[h264 @ 0x55d51a586540] mmco: unref short failure
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 20400 unjoint_samples 20400 joint_samples 61 [609605, 1046867]
processed_samples 20400 unjoint_samples 20400 joint_samples 60 [989778, 1038046]
processed_samples 20400 unjoint_samples 20400 joint_samples 62 [974322, 1040241]
[h264 @ 0x55d5171918c0] mmco: unref short failure
processed_samples 20400 unjoint_samples 20400 joint_samples 61 [603384, 1046713]
processed_samples 20400 unjoint_samples 20400 joint_samples 63 [73355, 1045348]
[h264 @ 0x55d516c21040] mmco: unref short failure
[h264 @ 0x55d516c21040] mmco: unref short failure
processed_samples 20400 unjoint_samples 20400 joint_samples 61 [1045911, 122833]
processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1046106, 124157]
processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1047462, 335378]
processed_samples 20400 unjoint_samples 20400 joint_samples 61 [609605, 1046867]
processed_samples 20400 unjoint_samples 20400 joint_samples 60 [989778, 1038046]
processed_samples 20400 unjoint_samples 20400 joint_samples 62 [974322, 1040241]
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x56215ee100c0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-12-01 21:28:07] iteration      493/     500 | consumed samples:         3944 | elapsed time per iteration (ms): 425120.9 | throughput per GPU (TFLOP/s/GPU): 122.4 | learning rate: 1.025181E-07 | global batch size:     8 | lm loss: 8.234768E-01 | loss scale: 1.0 | grad norm: 0.632 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514afd580] mmco: unref short failure
[h264 @ 0x55d514426700] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
 [2024-12-01 21:38:03] iteration      494/     500 | consumed samples:         3952 | elapsed time per iteration (ms): 596715.8 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 1.018501E-07 | global batch size:     8 | lm loss: 7.998064E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x56215bcafa40] mmco: unref short failure
[h264 @ 0x55d517193e00] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x55d517cb9180] mmco: unref short failure
[h264 @ 0x5621618e98c0] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215b928840] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d51401df80] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215cbc4280] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x56215f845e80] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d51421e400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1040126, 271248]
processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1046106, 426680]
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1045911, 338188]
processed_samples 20500 unjoint_samples 20500 joint_samples 63 [1047251, 198669]
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1021682, 1046867]
processed_samples 20500 unjoint_samples 20500 joint_samples 63 [362330, 1045348]
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [913941, 1046713]
processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1047462, 561982]
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x56215bb8bc00] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
[h264 @ 0x56215b217200] mmco: unref short failure
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1045911, 338188]
processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1046106, 426680]
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1040126, 271248]
processed_samples 20500 unjoint_samples 20500 joint_samples 63 [1047251, 198669]
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1021682, 1046867]
processed_samples 20500 unjoint_samples 20500 joint_samples 63 [362330, 1045348]
[h264 @ 0x55d51bcf3900] mmco: unref short failure
[h264 @ 0x55d51bcf3900] mmco: unref short failure
processed_samples 20500 unjoint_samples 20500 joint_samples 61 [913941, 1046713]
processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1047462, 561982]
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215f571a00] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d514409540] mmco: unref short failure
[h264 @ 0x5621608a8680] mmco: unref short failure
 [2024-12-01 21:47:09] iteration      495/     500 | consumed samples:         3960 | elapsed time per iteration (ms): 545102.0 | throughput per GPU (TFLOP/s/GPU): 95.5 | learning rate: 1.012849E-07 | global batch size:     8 | lm loss: 8.540187E-01 | loss scale: 1.0 | grad norm: 0.621 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x55d512fcdfc0] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bfb3a00] mmco: unref short failure
[h264 @ 0x56215bb86100] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x56215c492cc0] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x55d51722b380] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x56215f060d00] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d5194f31c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x55d51b2c01c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215b17e600] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51376ab40] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x56215bb50880] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d516fe4400] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
 [2024-12-01 21:54:43] iteration      496/     500 | consumed samples:         3968 | elapsed time per iteration (ms): 454849.5 | throughput per GPU (TFLOP/s/GPU): 114.4 | learning rate: 1.008223E-07 | global batch size:     8 | lm loss: 8.191250E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d516ae3f00] mmco: unref short failure
[h264 @ 0x55d5183f9f40] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x55d5145a1340] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x56215d8ddd40] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51326c900] mmco: unref short failure
[h264 @ 0x55d51713dd00] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215bf9f540] mmco: unref short failure
[h264 @ 0x56215baca280] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d51735fc80] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d517608ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d514b7c6c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215f0f40c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215d4974c0] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1040126, 588687]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [312567, 1046867]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [177108, 1046713]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1047462, 923886]
processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1045911, 666328]
processed_samples 20600 unjoint_samples 20600 joint_samples 63 [1047251, 636474]
processed_samples 20600 unjoint_samples 20600 joint_samples 63 [641892, 1045348]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1046106, 784127]
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x55d5130aa740] mmco: unref short failure
[h264 @ 0x56215d4b4580] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
[h264 @ 0x55d518378ac0] mmco: unref short failure
processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1040126, 588687]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [312567, 1046867]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [177108, 1046713]
processed_samples 20600 unjoint_samples 20600 joint_samples 63 [1047251, 636474]
processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1045911, 666328]
processed_samples 20600 unjoint_samples 20600 joint_samples 63 [641892, 1045348]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1047462, 923886]
processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1046106, 784127]
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x562161968b80] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d51368aac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x55d516907ac0] mmco: unref short failure
[h264 @ 0x56215d83de00] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x562161d176c0] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x55d515131d40] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x56215e692c80] mmco: unref short failure
[h264 @ 0x56215b75cc40] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
 [2024-12-01 22:05:19] iteration      497/     500 | consumed samples:         3976 | elapsed time per iteration (ms): 635852.3 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 1.004626E-07 | global batch size:     8 | lm loss: 8.748317E-01 | loss scale: 1.0 | grad norm: 0.712 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x55d5144ae0c0] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x56215c18da40] mmco: unref short failure
[h264 @ 0x55d516e74e00] mmco: unref short failure
[h264 @ 0x562163486b40] mmco: unref short failure
[h264 @ 0x55d5139fff40] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x55d513765580] mmco: unref short failure
[h264 @ 0x562160004ec0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x56215c8ed2c0] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x55d5141fd280] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215d733d80] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x56215c354fc0] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d516b7ca40] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x55d513779c00] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215cf77480] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
[h264 @ 0x56215c19aa80] mmco: unref short failure
 [2024-12-01 22:15:15] iteration      498/     500 | consumed samples:         3984 | elapsed time per iteration (ms): 595413.6 | throughput per GPU (TFLOP/s/GPU): 87.4 | learning rate: 1.002056E-07 | global batch size:     8 | lm loss: 8.245853E-01 | loss scale: 1.0 | grad norm: 0.575 | number of skipped iterations:   0 | number of nan iterations:   0 |
[h264 @ 0x562160944a00] mmco: unref short failure
[h264 @ 0x55d513a64780] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x55d5148de040] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x56215be6bd40] mmco: unref short failure
[h264 @ 0x55d51a1f0840] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d5141f5f40] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x55d51723b300] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
[h264 @ 0x56215b3f09c0] mmco: unref short failure
processed_samples 20700 unjoint_samples 20700 joint_samples 62 [652277, 1046867]
[h264 @ 0x56215c37f280] mmco: unref short failure
[h264 @ 0x56215c37f280] mmco: unref short failure
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1046106, 76576]
processed_samples 20700 unjoint_samples 20700 joint_samples 62 [531223, 1046713]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [890598, 1045348]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [254512, 1032742]
processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1040126, 867371]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1047251, 935193]
processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1045911, 992219]
[h264 @ 0x55d51a586540] mmco: unref short failure
processed_samples 20700 unjoint_samples 20700 joint_samples 62 [652277, 1046867]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1046106, 76576]
processed_samples 20700 unjoint_samples 20700 joint_samples 62 [531223, 1046713]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [890598, 1045348]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [254512, 1032742]
processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1047251, 935193]
processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1040126, 867371]
processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1045911, 992219]
[h264 @ 0x56215b2188c0] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d514261240] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x55d513904400] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215b4e5f40] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215c7c6240] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
[h264 @ 0x56215cc254c0] mmco: unref short failure
 [2024-12-01 22:23:50] iteration      499/     500 | consumed samples:         3992 | elapsed time per iteration (ms): 514810.0 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 1.000514E-07 | global batch size:     8 | lm loss: 8.215189E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2024-12-01 22:33:14] iteration      500/     500 | consumed samples:         4000 | elapsed time per iteration (ms): 564651.1 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 1.000000E-07 | global batch size:     8 | lm loss: 8.424357E-01 | loss scale: 1.0 | grad norm: 0.598 | number of skipped iterations:   0 | number of nan iterations:   0 |
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'[Errno 19] No such device: 'data_2'

[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'[Errno 19] No such device: 'data_2'[Errno 19] No such device: 'data_2'


[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
[Errno 19] No such device: 'data_2'
(min, max) time across ranks (ms):
    save-checkpoint ................................: (259269.35, 259269.62)
[2024-12-01 22:44:31,628] torch.distributed.elastic.agent.server.api: [ERROR] Error waiting on exit barrier. Elapsed: 300.10674571990967 seconds
+ set +x