+ echo Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//log_node31.txt Logging output to /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//log_node31.txt + export ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//ascend/31 + ASCEND_PROCESS_LOG_PATH=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//ascend/31 + mkdir -p /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743//ascend/31 + DATA_PATH=/local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml + TOKENIZER_PATH=/data_4/models/Qwen/Qwen2.5-14B-Instruct/ + CKPT_LOAD_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ + VIT_CKPT_LOAD_DIR=/ + CKPT_SAVE_DIR=/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743// + rsync -avh /local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743/ sending incremental file list sent 71 bytes received 12 bytes 166.00 bytes/sec total size is 23.84K speedup is 287.17 + cd /local_disk/cognitron_vl/ + rm -fr datasets + mkdir -p datasets + ln -s /data/data/ datasets/CV + ln -s /data/data/LLM datasets/LLM + ln -s /data/data/LMM datasets/LMM + source /local_disk/cognitron_vl//scripts/set_env_mg_npu.sh ++ source /usr/local/Ascend/driver/bin/setenv.bash +++ DEP_INFO_FILE=/etc/ascend_install.info +++ [[ -f /etc/ascend_install.info ]] +++ . /etc/ascend_install.info +++ DRV_LIB64_COMMON_LDPATH=/driver/lib64/common +++ DRV_LIB64_DRV_LDPATH=/driver/lib64/driver +++ DRV_LIB64_LDPATH=/driver/lib64 +++ export LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ LD_LIBRARY_PATH=/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin +++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin ++ source /usr/local/Ascend/ascend-toolkit/set_env.sh +++ export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ export ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +++ ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest ++++ arch +++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/driver/lib64/common:/driver/lib64/driver:/driver/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64:/usr/local/Ascend/ascend-toolkit/latest/tools/aml/lib64/plugin:/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/x86_64:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/lib/x86_64-linux-gnu/hdf5/serial: +++ export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: +++ PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: +++ export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin +++ PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/tools/ccec_compiler/bin:/root/miniconda3/envs/py38/bin:/root/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin:/usr/local/bin +++ export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest +++ ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest +++ export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp +++ ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp +++ export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit +++ TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit +++ export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest +++ ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest ++ export HCCL_CONNECT_TIMEOUT=7200 ++ HCCL_CONNECT_TIMEOUT=7200 ++ export HCCL_EXEC_TIMEOUT=7200 ++ HCCL_EXEC_TIMEOUT=7200 ++ export COMBINED_ENABLE=1 ++ COMBINED_ENABLE=1 ++ export MULTI_STREAM_MEMORY_REUSE=1 ++ MULTI_STREAM_MEMORY_REUSE=1 ++ export HCCL_RDMA_TC=160 ++ HCCL_RDMA_TC=160 ++ export HCCL_RDMA_SL=5 ++ HCCL_RDMA_SL=5 ++ export HCCL_INTRA_PCIE_ENABLE=0 ++ HCCL_INTRA_PCIE_ENABLE=0 ++ export HCCL_INTRA_ROCE_ENABLE=1 ++ HCCL_INTRA_ROCE_ENABLE=1 ++ export HCCL_RDMA_TIMEOUT=20 ++ HCCL_RDMA_TIMEOUT=20 ++ export INF_NAN_MODE_ENABLE=1 ++ INF_NAN_MODE_ENABLE=1 ++ export DISTRIBUTED_BACKEND=hccl ++ DISTRIBUTED_BACKEND=hccl ++ export ASCEND_LAUNCH_BLOCKING=0 ++ ASCEND_LAUNCH_BLOCKING=0 ++ export ASCEND_SLOG_PRINT_TO_STDOUT=0 ++ ASCEND_SLOG_PRINT_TO_STDOUT=0 ++ export ASCEND_GLOBAL_LOG_LEVEL=3 ++ ASCEND_GLOBAL_LOG_LEVEL=3 ++ export ASCEND_GLOBAL_EVENT_ENABLE=0 ++ ASCEND_GLOBAL_EVENT_ENABLE=0 ++ export TASK_QUEUE_ENABLE=1 ++ TASK_QUEUE_ENABLE=1 ++ export PTCOPY_ENABLE=1 ++ PTCOPY_ENABLE=1 ++ export COMBINED_ENABLE=1 ++ COMBINED_ENABLE=1 ++ export DYNAMIC_OP=ADD#MUL ++ DYNAMIC_OP=ADD#MUL ++ export HCCL_WHITELIST_DISABLE=1 ++ HCCL_WHITELIST_DISABLE=1 ++ export HCCL_CONNECT_TIMEOUT=7200 ++ HCCL_CONNECT_TIMEOUT=7200 ++ export HCCL_WHITELIST_DISABLE=1 ++ HCCL_WHITELIST_DISABLE=1 ++ export CUDA_DEVICE_MAX_CONNECTIONS=1 ++ CUDA_DEVICE_MAX_CONNECTIONS=1 ++ pip3 install --no-index --find-links=/data/software/ -r requirements_npu.txt Looking in links: /data/software/ Processing data/software/expecttest-0.2.1-py3-none-any.whl (from -r requirements_npu.txt (line 1)) Requirement already satisfied: peft in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 2)) (0.7.0) Processing data/software/XlsxWriter-3.2.0-py3-none-any.whl (from -r requirements_npu.txt (line 3)) Requirement already satisfied: termcolor in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 4)) (2.4.0) Requirement already satisfied: tabulate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 5)) (0.9.0) Processing data/software/tiktoken-0.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 6)) Requirement already satisfied: matplotlib in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 7)) (3.7.5) Processing data/software/datasets-3.0.0-py3-none-any.whl (from -r requirements_npu.txt (line 8)) Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 9)) (0.7.0) Processing data/software/pybind11-2.13.6-py3-none-any.whl (from -r requirements_npu.txt (line 10)) Requirement already satisfied: tensorboardX in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 11)) (2.6.2.2) Processing data/software/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from -r requirements_npu.txt (line 12)) Requirement already satisfied: transformers>=4.40.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 13)) (4.40.1) Requirement already satisfied: deepspeed>=0.14.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 14)) (0.14.5) Processing data/software/accelerate-0.34.2-py3-none-any.whl (from -r requirements_npu.txt (line 15)) Requirement already satisfied: timm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from -r requirements_npu.txt (line 16)) (0.9.16) Processing data/software/flask-3.0.3-py3-none-any.whl (from -r requirements_npu.txt (line 17)) Processing data/software/Flask_RESTful-0.3.10-py2.py3-none-any.whl (from -r requirements_npu.txt (line 18)) Processing data/software/decord-0.6.0-py3-none-manylinux2010_x86_64.whl (from -r requirements_npu.txt (line 19)) Processing data/software/natsort-8.4.0-py3-none-any.whl (from -r requirements_npu.txt (line 20)) Requirement already satisfied: numpy>=1.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (1.24.4) Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (23.2) Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.9.8) Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (5.4.1) Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (2.1.0+cpu) Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (4.66.2) Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.4.2) Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft->-r requirements_npu.txt (line 2)) (0.20.3) Requirement already satisfied: regex>=2022.1.18 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2023.12.25) Requirement already satisfied: requests>=2.26.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tiktoken->-r requirements_npu.txt (line 6)) (2.31.0) Requirement already satisfied: contourpy>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.1.1) Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (4.49.0) Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (1.4.5) Requirement already satisfied: pillow>=6.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (3.1.1) Requirement already satisfied: python-dateutil>=2.7 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (2.8.2) Requirement already satisfied: importlib-resources>=3.2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from matplotlib->-r requirements_npu.txt (line 7)) (6.1.2) Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.13.1) Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.3.7) Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2.0.3) Processing data/software/requests-2.32.3-py3-none-any.whl (from tiktoken->-r requirements_npu.txt (line 6)) Processing data/software/tqdm-4.67.1-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.4.1) Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (0.70.15) Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2023.10.0) Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets>=2.21.0->-r requirements_npu.txt (line 8)) (3.9.3) Processing data/software/huggingface_hub-0.26.2-py3-none-any.whl (from peft->-r requirements_npu.txt (line 2)) Requirement already satisfied: protobuf>=3.20 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from tensorboardX->-r requirements_npu.txt (line 11)) (4.25.3) Requirement already satisfied: tokenizers<0.20,>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers>=4.40.1->-r requirements_npu.txt (line 13)) (0.19.1) Requirement already satisfied: hjson in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (3.1.0) Requirement already satisfied: ninja in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.11.1.1) Requirement already satisfied: nvidia-ml-py in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (12.560.30) Requirement already satisfied: py-cpuinfo in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (9.0.0) Requirement already satisfied: pydantic in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from deepspeed>=0.14.2->-r requirements_npu.txt (line 14)) (1.10.15) Processing data/software/safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from peft->-r requirements_npu.txt (line 2)) Requirement already satisfied: torchvision in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from timm->-r requirements_npu.txt (line 16)) (0.16.0) Requirement already satisfied: Werkzeug>=3.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.0.1) Requirement already satisfied: Jinja2>=3.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (3.1.3) Processing data/software/itsdangerous-2.2.0-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) Requirement already satisfied: click>=8.1.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (8.1.7) Processing data/software/blinker-1.8.2-py3-none-any.whl (from flask->-r requirements_npu.txt (line 17)) Requirement already satisfied: importlib-metadata>=3.6.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask->-r requirements_npu.txt (line 17)) (7.0.1) Processing data/software/aniso8601-9.0.1-py2.py3-none-any.whl (from flask_restful->-r requirements_npu.txt (line 18)) Requirement already satisfied: six>=1.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (1.16.0) Requirement already satisfied: pytz in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from flask_restful->-r requirements_npu.txt (line 18)) (2024.1) Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.3.1) Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (23.2.0) Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.4.1) Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (6.0.5) Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (1.9.4) Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (4.0.3) Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft->-r requirements_npu.txt (line 2)) (4.10.0) Requirement already satisfied: zipp>=0.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from importlib-metadata>=3.6.0->flask->-r requirements_npu.txt (line 17)) (3.17.0) Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from Jinja2>=3.1.2->flask->-r requirements_npu.txt (line 17)) (2.1.5) Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (3.6) Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (1.26.18) Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests>=2.26.0->tiktoken->-r requirements_npu.txt (line 6)) (2024.2.2) Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.4) Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (3.1) Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets>=2.21.0->-r requirements_npu.txt (line 8)) (2024.1) Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->torch>=1.13.0->peft->-r requirements_npu.txt (line 2)) (1.3.0) DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 Installing collected packages: aniso8601, xlsxwriter, tqdm, safetensors, requests, pybind11, pyarrow, natsort, itsdangerous, expecttest, decord, blinker, tiktoken, huggingface-hub, flask, flask_restful, accelerate, datasets Attempting uninstall: tqdm Found existing installation: tqdm 4.66.2 Uninstalling tqdm-4.66.2: Successfully uninstalled tqdm-4.66.2 Attempting uninstall: safetensors Found existing installation: safetensors 0.4.2 Uninstalling safetensors-0.4.2: Successfully uninstalled safetensors-0.4.2 Attempting uninstall: requests Found existing installation: requests 2.31.0 Uninstalling requests-2.31.0: Successfully uninstalled requests-2.31.0 Attempting uninstall: pyarrow Found existing installation: pyarrow 15.0.0 Uninstalling pyarrow-15.0.0: Successfully uninstalled pyarrow-15.0.0 Attempting uninstall: huggingface-hub Found existing installation: huggingface-hub 0.20.3 Uninstalling huggingface-hub-0.20.3: Successfully uninstalled huggingface-hub-0.20.3 Attempting uninstall: accelerate Found existing installation: accelerate 0.25.0 Uninstalling accelerate-0.25.0: Successfully uninstalled accelerate-0.25.0 Attempting uninstall: datasets Found existing installation: datasets 2.16.0 Uninstalling datasets-2.16.0: Successfully uninstalled datasets-2.16.0 ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. tikit 1.8.2.240926 requires dicttoxml==1.7.4, which is not installed. tikit 1.8.2.240926 requires docopt==0.6.2, which is not installed. tikit 1.8.2.240926 requires future==0.18.2, which is not installed. tikit 1.8.2.240926 requires hdfs==2.6.0, which is not installed. tikit 1.8.2.240926 requires pure-sasl==0.6.2, which is not installed. tikit 1.8.2.240926 requires py4j==0.10.7, which is not installed. tikit 1.8.2.240926 requires PyHive[hive]==0.6.4, which is not installed. tikit 1.8.2.240926 requires pyjwt>=2.4.0, which is not installed. tikit 1.8.2.240926 requires requests-kerberos>=0.14.0, which is not installed. tikit 1.8.2.240926 requires sasl==0.3.1, which is not installed. tikit 1.8.2.240926 requires thrift==0.15.0, which is not installed. tikit 1.8.2.240926 requires thrift-sasl>=0.1.0, which is not installed. tikit 1.8.2.240926 requires certifi==2021.10.8, but you have certifi 2024.2.2 which is incompatible. tikit 1.8.2.240926 requires cos-python-sdk-v5==1.9.29, but you have cos-python-sdk-v5 1.9.26 which is incompatible. tikit 1.8.2.240926 requires idna==3.3, but you have idna 3.6 which is incompatible. tikit 1.8.2.240926 requires prettytable==2.5.0, but you have prettytable 3.11.0 which is incompatible. tikit 1.8.2.240926 requires urllib3==1.26.7, but you have urllib3 1.26.18 which is incompatible. tikit 1.8.2.240926 requires wcwidth==0.2.5, but you have wcwidth 0.2.13 which is incompatible. Successfully installed accelerate-0.34.2 aniso8601-9.0.1 blinker-1.8.2 datasets-3.0.0 decord-0.6.0 expecttest-0.2.1 flask-3.0.3 flask_restful-0.3.10 huggingface-hub-0.26.2 itsdangerous-2.2.0 natsort-8.4.0 pyarrow-17.0.0 pybind11-2.13.6 requests-2.32.3 safetensors-0.4.5 tiktoken-0.7.0 tqdm-4.67.1 xlsxwriter-3.2.0 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv ++ return 0 + MEGATRON_DIR=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ + MINDSPEED_DIR=/local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ + MODELLINK_DIR=/local_disk/cognitron_vl//third_party/ModelLink/ + pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0/ Looking in links: /data/software/ Obtaining file://local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0 Installing build dependencies: started Installing build dependencies: finished with status 'done' Checking if build backend supports build_editable: started Checking if build backend supports build_editable: finished with status 'done' Getting requirements to build editable: started Getting requirements to build editable: finished with status 'done' Installing backend dependencies: started Installing backend dependencies: finished with status 'done' Preparing editable metadata (pyproject.toml): started Preparing editable metadata (pyproject.toml): finished with status 'done' Building wheels for collected packages: megatron_core Building editable for megatron_core (pyproject.toml): started Building editable for megatron_core (pyproject.toml): finished with status 'done' Created wheel for megatron_core: filename=megatron_core-0.6.0-0.editable-cp38-cp38-linux_x86_64.whl size=8791 sha256=1c8a73544a768ff0759eb2db03ef8e548406a6700abe057332d8072922777a16 Stored in directory: /tmp/pip-ephem-wheel-cache-f3i1665g/wheels/54/9c/d1/d2015aa0c34e791e64d65d19395e5a9a5528f0c63fd519b9ff Successfully built megatron_core DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 Installing collected packages: megatron_core Successfully installed megatron_core-0.6.0 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv + pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/MindSpeed_core_r0.6.0/ Looking in links: /data/software/ Obtaining file://local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0 Preparing metadata (setup.py): started Preparing metadata (setup.py): finished with status 'done' WARNING: Error parsing requirements for tokenizers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/tokenizers-0.19.1.dist-info/METADATA' WARNING: Error parsing requirements for transformers: [Errno 2] No such file or directory: '/root/miniconda3/envs/py38/lib/python3.8/site-packages/transformers-4.40.1.dist-info/METADATA' DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 Installing collected packages: mindspeed Running setup.py develop for mindspeed Successfully installed mindspeed-0.6.0 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv + pip3 install --no-index --find-links=/data/software/ -e /local_disk/cognitron_vl//third_party/ModelLink/ Looking in links: /data/software/ Obtaining file://local_disk/cognitron_vl/third_party/ModelLink Preparing metadata (setup.py): started Preparing metadata (setup.py): finished with status 'done' Requirement already satisfied: numpy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.24.4) Processing data/software/transformers-4.43.2-py3-none-any.whl (from modellink==0.0.1) Processing data/software/transformers-stream-generator-0.0.5.tar.gz (from modellink==0.0.1) Preparing metadata (setup.py): started Preparing metadata (setup.py): finished with status 'done' Requirement already satisfied: sympy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.4) Requirement already satisfied: decorator in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (5.1.1) Requirement already satisfied: scipy in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.10.1) Requirement already satisfied: sentencepiece in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.2.0) Requirement already satisfied: einops in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) Requirement already satisfied: datasets in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (3.0.0) Requirement already satisfied: pybind11 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (2.13.6) Requirement already satisfied: accelerate in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.34.2) Requirement already satisfied: six in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (1.16.0) Requirement already satisfied: protobuf in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (4.25.3) Processing data/software/peft-0.7.1-py3-none-any.whl (from modellink==0.0.1) Requirement already satisfied: tiktoken in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from modellink==0.0.1) (0.7.0) Requirement already satisfied: packaging>=20.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (23.2) Requirement already satisfied: psutil in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.9.8) Requirement already satisfied: pyyaml in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (5.4.1) Requirement already satisfied: torch>=1.13.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (2.1.0+cpu) Requirement already satisfied: tqdm in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (4.67.1) Requirement already satisfied: safetensors in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.4.5) Requirement already satisfied: huggingface-hub>=0.17.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from peft==0.7.1->modellink==0.0.1) (0.26.2) Requirement already satisfied: filelock in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (3.13.1) Requirement already satisfied: regex!=2019.12.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2023.12.25) Requirement already satisfied: requests in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from transformers==4.43.2->modellink==0.0.1) (2.32.3) Processing data/software/tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from transformers==4.43.2->modellink==0.0.1) Requirement already satisfied: pyarrow>=15.0.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (17.0.0) Requirement already satisfied: dill<0.3.9,>=0.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.3.7) Requirement already satisfied: pandas in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (2.0.3) Requirement already satisfied: xxhash in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.4.1) Requirement already satisfied: multiprocess in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (0.70.15) Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets->modellink==0.0.1) (2023.10.0) Requirement already satisfied: aiohttp in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from datasets->modellink==0.0.1) (3.9.3) Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from sympy->modellink==0.0.1) (1.3.0) Requirement already satisfied: aiosignal>=1.1.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.3.1) Requirement already satisfied: attrs>=17.3.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (23.2.0) Requirement already satisfied: frozenlist>=1.1.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.4.1) Requirement already satisfied: multidict<7.0,>=4.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (6.0.5) Requirement already satisfied: yarl<2.0,>=1.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (1.9.4) Requirement already satisfied: async-timeout<5.0,>=4.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from aiohttp->datasets->modellink==0.0.1) (4.0.3) Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from huggingface-hub>=0.17.0->peft==0.7.1->modellink==0.0.1) (4.10.0) Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (3.6) Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (1.26.18) Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from requests->transformers==4.43.2->modellink==0.0.1) (2024.2.2) Requirement already satisfied: networkx in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1) Requirement already satisfied: jinja2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (3.1.3) Requirement already satisfied: python-dateutil>=2.8.2 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) Requirement already satisfied: tzdata>=2022.1 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from pandas->datasets->modellink==0.0.1) (2024.1) Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/envs/py38/lib/python3.8/site-packages (from jinja2->torch>=1.13.0->peft==0.7.1->modellink==0.0.1) (2.1.5) Building wheels for collected packages: transformers_stream_generator Building wheel for transformers_stream_generator (setup.py): started Building wheel for transformers_stream_generator (setup.py): finished with status 'done' Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=53a0efa1548230be4832bd2d5f76d2b932ac2ffee1961d12082c62ce27bcc265 Stored in directory: /root/.cache/pip/wheels/56/8c/42/5381d9c36bc85f28982f4cf8f98dc44d37a6d6c04897a5cb7c Successfully built transformers_stream_generator DEPRECATION: apex 0.1-ascend-20240523 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of apex or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063 Installing collected packages: tokenizers, transformers, transformers_stream_generator, peft, modellink Attempting uninstall: tokenizers Found existing installation: tokenizers 0.20.3 Uninstalling tokenizers-0.20.3: Successfully uninstalled tokenizers-0.20.3 Attempting uninstall: transformers Found existing installation: transformers 4.46.3 Uninstalling transformers-4.46.3: Successfully uninstalled transformers-4.46.3 Attempting uninstall: peft Found existing installation: peft 0.7.0 Uninstalling peft-0.7.0: Successfully uninstalled peft-0.7.0 Running setup.py develop for modellink Successfully installed modellink-0.0.1 peft-0.7.1 tokenizers-0.19.1 transformers-4.43.2 transformers_stream_generator-0.0.5 WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv + export PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: + PYTHONPATH=/local_disk/cognitron_vl//third_party/Megatron-LM_core_r0.6.0//:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe: + GPUS_PER_NODE=16 + NNODES=32 + NODE_RANK=31 + MASTER_PORT=34567 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + CUDA_DEVICE_MAX_CONNECTIONS=1 + export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + VISION_SEQ_LENGTH=1025 + IMAGE_TOKEN_LENGTH=256 + IMAGE_SIZE=448 + VISION_MODEL_TYPE=intern_300m + TP=8 + PP=1 + CP=8 + CP_ALGO=megatron_cp_algo + CP_MASK=causal + DISTRIBUTED_ARGS=' --nproc_per_node 16 --nnodes 32 --node_rank 31 --master_addr train-1198772881325351168-93vlj4s2getc-master-0.train-100034032793.svc.cluster.local --master_port 34567 ' + GPT_ARGS=' --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 8 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 1048576 --max-position-embeddings 1048576 --micro-batch-size 1 --global-batch-size 8 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 500 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-image 4096 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --logit-mask --cross-dataset-joint ' + DATA_ARGS=' --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml --split 100,0,0 --data-seq-length 1048576 --num-workers 8 ' + CKPT_ARGS=' --load /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ --vit-load / --no-load-optim --no-load-rng --seed 42424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743// ' + OUTPUT_ARGS=' --log-interval 1 --save-interval 20 --eval-interval 20 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 ' + torchrun --nproc_per_node 16 --nnodes 32 --node_rank 31 --master_addr train-1198772881325351168-93vlj4s2getc-master-0.train-100034032793.svc.cluster.local --master_port 34567 /local_disk/cognitron_vl//lcvlm_modellink/pretrain_lcvlm.py --use-mcore-models --tensor-model-parallel-size 8 --pipeline-model-parallel-size 1 --context-parallel-size 8 --context-parallel-algo megatron_cp_algo --cp-attention-mask-type causal --use-cp-send-recv-overlap --no-create-attention-mask-in-dataloader --sparse-mode 4 --sequence-parallel --recompute-method block --recompute-granularity full --recompute-num-layers 48 --num-layers 48 --hidden-size 5120 --ffn-hidden-size 13824 --num-attention-heads 40 --group-query-attention --num-query-groups 8 --tokenizer-type PretrainedFromHF --tokenizer-name-or-path /data_4/models/Qwen/Qwen2.5-14B-Instruct/ --seq-length 1048576 --max-position-embeddings 1048576 --micro-batch-size 1 --global-batch-size 8 --make-vocab-size-divisible-by 1 --padded-vocab-size 152064 --rotary-base 1000000.0 --lr 5.00e-6 --train-iters 500 --lr-decay-style cosine --untie-embeddings-and-output-weights --disable-bias-linear --attention-dropout 0.0 --init-method-std 0.01 --hidden-dropout 0.0 --position-embedding-type rope --normalization RMSNorm --use-fused-rmsnorm --norm-epsilon 1e-6 --swiglu --use-flash-attn --use-fused-rotary-pos-emb --use-rotary-position-embeddings --use-fused-swiglu --use-mc2 --no-masked-softmax-fusion --attention-softmax-in-fp32 --min-lr 1.00e-7 --weight-decay 0.0 --lr-warmup-fraction 0.03 --clip-grad 1.0 --adam-beta1 0.9 --adam-beta2 0.999 --add-qkv-bias --initial-loss-scale 4096 --no-gradient-accumulation-fusion --use-distributed-optimizer --bf16 --overlap-grad-reduce --finetune --vision-model-freeze --vision-model-type intern_300m --vision-downsample-ratio 0.5 --vision-projector-type mlp --vision-projector-pre-norm --vision-process-type dynamic --vision-normalize-type imagenet --vision-seq-length 1025 --image-token-length 256 --image-size 448 --prompt-format qwen2 --is-instruction-dataset --max-num-image 4096 --max-fps 1 --add-class-token --min-patch-grid 1 --max-patch-grid 12 --logit-mask --cross-dataset-joint --data-path /local_disk/cognitron_vl//configs/lcvlm_finetune_stage4.yaml --split 100,0,0 --data-seq-length 1048576 --num-workers 8 --load /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/ --vit-load / --no-load-optim --no-load-rng --seed 42424242 --save /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp8_stage4.sh/20241128_234743// --log-interval 1 --save-interval 20 --eval-interval 20 --eval-iters 0 --log-throughput --distributed-timeout-minutes 120 --distributed-backend nccl [2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] [2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] ***************************************** [2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. [2024-11-28 15:50:34,775] torch.distributed.run: [WARNING] ***************************************** Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Creating extension directory /root/.cache/torch_extensions/py38_cpu/adaptive_cp... Emitting ninja build file /root/.cache/torch_extensions/py38_cpu/adaptive_cp/build.ninja... Building extension module adaptive_cp... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... Using /root/.cache/torch_extensions/py38_cpu as PyTorch extensions root... [1/2] c++ -MMD -MF adaptive_cp.o.d -DTORCH_EXTENSION_NAME=adaptive_cp -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/usr/local/Ascend/ascend-toolkit/latest/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/include -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/third_party -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/acl -I/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/inc -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/TH -isystem /root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/include/THC -isystem /root/miniconda3/envs/py38/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack -fPIC -pie -Wl,--disable-new-dtags,--rpath -s -O2 -c local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/ops/csrc/algorithm/adaptive_cp/adaptive_cp.cpp -o adaptive_cp.o [2/2] c++ adaptive_cp.o -shared -L/usr/local/Ascend/ascend-toolkit/latest/lib64 -lascendcl -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch_npu/lib -ltorch_npu -L/root/miniconda3/envs/py38/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o adaptive_cp.so Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") Loading extension module adaptive_cp... Loading extension module adaptive_cp... Loading extension module adaptive_cp... local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") local_disk/cognitron_vl/third_party/MindSpeed_core_r0.6.0/mindspeed/core/tensor_parallel/layers.py:30: UserWarning: failed to generate the npu_matmul_add_fp32 warnings.warn("failed to generate the npu_matmul_add_fp32") /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( /root/miniconda3/envs/py38/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source? warn( > compiling dataset index builder ... make: Entering directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' make: Nothing to be done for 'default'. make: Leaving directory 'local_disk/cognitron_vl/third_party/Megatron-LM_core_r0.6.0/megatron/core/datasets' >>> done with dataset index builder. Compilation time: 0.483 seconds vision_projector_recompute False vision_projector_recompute False vision_projector_recompute Falsevision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_projector_recompute False vision_model_freeze => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.vision_model_freeze => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.vision_model_freeze vision_model_freeze => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. vision_model_freeze => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. vision_model_freeze=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.vision_model_freeze=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False.=> set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() )=> set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) vision_model_freeze => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) vision_model_freeze => set param external_feature_model.vit.class_token torch.Size([1, 1, 1024]) requires grad to False. => set param external_feature_model.vit.conv1.weight torch.Size([1024, 3, 14, 14]) requires grad to False. => set param external_feature_model.vit.conv1.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.position_embeddings.weight torch.Size([1025, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.0.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.1.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.2.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.3.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.4.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.5.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.6.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.7.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.8.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.9.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.10.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.11.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.12.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.13.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.14.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.15.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.16.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.17.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.18.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.19.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.20.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.21.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.22.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls1 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.ls2 torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.input_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.weight torch.Size([1024, 128]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_proj.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.weight torch.Size([384, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.self_attention.linear_qkv.bias torch.Size([384]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.weight torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.pre_mlp_layernorm.bias torch.Size([1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.weight torch.Size([512, 1024]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc1.bias torch.Size([512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.weight torch.Size([1024, 512]) requires grad to False. => set param external_feature_model.vit.decoder.layers.23.mlp.linear_fc2.bias torch.Size([1024]) requires grad to False. model GPTVLModel( (external_feature_model): MegatronVisionModel( (vit): InternViTModel( (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) (position_embeddings): Embedding(1025, 1024) (decoder): TransformerBlock( (layers): ModuleList( (0-23): 24 x InternViTTransformerLayer( (input_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() ) (self_attn_bda): IdentityFuncOp() (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) (mlp_bda): IdentityFuncOp() ) ) ) ) (vision_projection): MultimodalProjector( (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) (pre_proj_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) ) (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): RotaryEmbedding() (decoder): TransformerBlock( (layers): ModuleList( (0-47): 48 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): SelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False)_get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc1.weight key (1.0, 1.0, False, False)_get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.vision_projection.encoder.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.external_feature_model.pre_proj_layernorm.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.embedding.word_embeddings.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.0.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.1.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.2.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.3.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.4.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.5.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.6.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.7.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.8.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.9.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.10.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.11.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.12.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.13.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.14.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.15.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.16.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.17.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.18.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.19.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.20.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.21.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.22.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.23.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.24.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.25.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.26.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.27.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.28.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.29.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.30.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.31.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.32.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.33.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.34.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.35.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.36.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.37.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.38.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.39.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.40.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.41.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.42.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.43.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.44.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.45.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.46.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.input_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_proj.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.self_attention.linear_qkv.bias key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.pre_mlp_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc1.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.layers.47.mlp.linear_fc2.weight key (1.0, 1.0, False, False) _get_param_groups name module.module.decoder.final_layernorm.weight key (0.0, 1.0, False, False) _get_param_groups name module.module.output_layer.weight key (1.0, 1.0, False, False) _load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration_load_base_checkpoint iteration _load_base_checkpoint iteration 1000_load_base_checkpoint iteration _load_base_checkpoint iteration _load_base_checkpoint iteration _load_base_checkpoint release 100010001000 1000_load_base_checkpoint release _load_base_checkpoint iteration 10001000 False_load_base_checkpoint iteration 1000 _load_base_checkpoint release _load_base_checkpoint release 10001000 1000 False 1000_load_base_checkpoint release_load_base_checkpoint release_load_base_checkpoint release 1000False False_load_base_checkpoint release1000 1000 _load_base_checkpoint release_load_base_checkpoint release False False _load_base_checkpoint release False _load_base_checkpoint release1000False _load_base_checkpoint releaseFalse_load_base_checkpoint release False_load_base_checkpoint release_load_base_checkpoint release False False False FalseFalse False _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_02/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_00/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_03/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_05/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_00/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_01/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_02/model_optim_rng.pt _load_base_checkpoint_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_07/model_optim_rng.pt/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_04/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_03/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_04/model_optim_rng.pt _load_base_checkpoint_load_base_checkpoint_load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_06/model_optim_rng.pt/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_01/model_optim_rng.pt/data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_06/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_05/model_optim_rng.pt _load_base_checkpoint /data_2/output/LM/scripts/modellink/qwen25/finetune_qwen25_14b_intern_300m_ptd_tp8pp1cp2_stage3.sh/20241127_204213/iter_0001000/mp_rank_07/model_optim_rng.pt load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True load_checkpoint iteration 0 load_checkpoint release False strict True (min, max) time across ranks (ms): load-checkpoint ................................: (35346.45, 35346.82) > rank 506 does not create GPT datasets ... > rank 507 does not create GPT datasets ... > rank 499 does not create GPT datasets ... > rank 505 does not create GPT datasets ...> rank 503 does not create GPT datasets ... > rank 501 does not create GPT datasets ... > rank 511 does not create GPT datasets ...> rank 509 does not create GPT datasets ... > rank 497 does not create GPT datasets ... > rank 498 does not create GPT datasets ... > rank 504 is creating GPT datasets ...> rank 502 does not create GPT datasets ...> rank 508 does not create GPT datasets ... > rank 510 does not create GPT datasets ... > rank 500 does not create GPT datasets ... > rank 496 is creating GPT datasets ... target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)] possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]] target_ratios [(1, 1), (1, 2), (2, 1), (3, 1), (1, 3), (2, 2), (4, 1), (1, 4), (5, 1), (1, 5), (1, 6), (6, 1), (3, 2), (2, 3), (7, 1), (1, 7), (4, 2), (2, 4), (1, 8), (8, 1), (1, 9), (3, 3), (9, 1), (2, 5), (5, 2), (10, 1), (1, 10), (11, 1), (1, 11), (12, 1), (3, 4), (4, 3), (1, 12), (6, 2), (2, 6)] possible_resolutions [[448, 448], [448, 896], [896, 448], [1344, 448], [448, 1344], [896, 896], [1792, 448], [448, 1792], [2240, 448], [448, 2240], [448, 2688], [2688, 448], [1344, 896], [896, 1344], [3136, 448], [448, 3136], [1792, 896], [896, 1792], [448, 3584], [3584, 448], [448, 4032], [1344, 1344], [4032, 448], [896, 2240], [2240, 896], [4480, 448], [448, 4480], [4928, 448], [448, 4928], [5376, 448], [1344, 1792], [1792, 1344], [448, 5376], [2688, 896], [896, 2688]] (min, max) time across ranks (ms): model-and-optimizer-setup ......................: (35985.64, 35997.94) train/valid/test-data-iterators-setup ..........: (302758.09, 303131.07) [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51862cd80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51862cd80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure processed_samples 100 unjoint_samples 100 joint_samples 0 [136301, 185903] processed_samples 100 unjoint_samples 100 joint_samples 0 [136301, 185903] processed_samples 100 unjoint_samples 100 joint_samples 0 [161329, 159174] processed_samples 100 unjoint_samples 100 joint_samples 0 [230777, 221579] processed_samples 100 unjoint_samples 100 joint_samples 0 [135670, 136846] processed_samples 100 unjoint_samples 100 joint_samples 0 [161329, 159174] processed_samples 100 unjoint_samples 100 joint_samples 0 [230777, 221579] processed_samples 100 unjoint_samples 100 joint_samples 0 [135670, 136846] processed_samples 100 unjoint_samples 100 joint_samples 0 [185666, 185971] processed_samples 100 unjoint_samples 100 joint_samples 0 [185666, 185971] processed_samples 100 unjoint_samples 100 joint_samples 0 [136013, 137062] processed_samples 100 unjoint_samples 100 joint_samples 0 [136013, 137062] [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 100 unjoint_samples 100 joint_samples 0 [144154, 142029] processed_samples 100 unjoint_samples 100 joint_samples 0 [144154, 142029] processed_samples 100 unjoint_samples 100 joint_samples 0 [142372, 140436] processed_samples 100 unjoint_samples 100 joint_samples 0 [142372, 140436] [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure processed_samples 200 unjoint_samples 200 joint_samples 0 [442317, 476094] processed_samples 200 unjoint_samples 200 joint_samples 0 [442317, 476094] processed_samples 200 unjoint_samples 200 joint_samples 0 [304017, 303803] processed_samples 200 unjoint_samples 200 joint_samples 0 [304017, 303803] processed_samples 200 unjoint_samples 200 joint_samples 0 [308595, 302406] processed_samples 200 unjoint_samples 200 joint_samples 0 [308595, 302406] processed_samples 200 unjoint_samples 200 joint_samples 0 [301352, 305263] processed_samples 200 unjoint_samples 200 joint_samples 0 [301352, 305263] processed_samples 200 unjoint_samples 200 joint_samples 0 [317896, 339618] processed_samples 200 unjoint_samples 200 joint_samples 0 [317896, 339618] processed_samples 200 unjoint_samples 200 joint_samples 0 [394104, 382765] processed_samples 200 unjoint_samples 200 joint_samples 0 [394104, 382765] processed_samples 200 unjoint_samples 200 joint_samples 0 [276598, 277077] processed_samples 200 unjoint_samples 200 joint_samples 0 [276598, 277077] processed_samples 200 unjoint_samples 200 joint_samples 0 [361042, 360328] processed_samples 200 unjoint_samples 200 joint_samples 0 [361042, 360328] [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 300 unjoint_samples 300 joint_samples 0 [492229, 488923] processed_samples 300 unjoint_samples 300 joint_samples 0 [492229, 488923] processed_samples 300 unjoint_samples 300 joint_samples 0 [477837, 477325] processed_samples 300 unjoint_samples 300 joint_samples 0 [477837, 477325] processed_samples 300 unjoint_samples 300 joint_samples 0 [590298, 590047] processed_samples 300 unjoint_samples 300 joint_samples 0 [590298, 590047] processed_samples 300 unjoint_samples 300 joint_samples 0 [500107, 497881] processed_samples 300 unjoint_samples 300 joint_samples 0 [500107, 497881] processed_samples 300 unjoint_samples 300 joint_samples 0 [400463, 400576] processed_samples 300 unjoint_samples 300 joint_samples 0 [400463, 400576] processed_samples 300 unjoint_samples 300 joint_samples 0 [604230, 621018] processed_samples 300 unjoint_samples 300 joint_samples 0 [604230, 621018] processed_samples 300 unjoint_samples 300 joint_samples 0 [511598, 511244] processed_samples 300 unjoint_samples 300 joint_samples 0 [511598, 511244] processed_samples 300 unjoint_samples 300 joint_samples 0 [513168, 514079] processed_samples 300 unjoint_samples 300 joint_samples 0 [513168, 514079] [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure processed_samples 400 unjoint_samples 400 joint_samples 0 [645141, 644007] processed_samples 400 unjoint_samples 400 joint_samples 0 [645141, 644007] processed_samples 400 unjoint_samples 400 joint_samples 0 [682733, 684223] processed_samples 400 unjoint_samples 400 joint_samples 0 [682733, 684223] processed_samples 400 unjoint_samples 400 joint_samples 0 [629361, 635981] processed_samples 400 unjoint_samples 400 joint_samples 0 [629361, 635981] processed_samples 400 unjoint_samples 400 joint_samples 0 [672742, 688581] processed_samples 400 unjoint_samples 400 joint_samples 0 [672742, 688581] processed_samples 400 unjoint_samples 400 joint_samples 0 [773371, 774327] processed_samples 400 unjoint_samples 400 joint_samples 0 [773371, 774327] processed_samples 400 unjoint_samples 400 joint_samples 0 [707094, 705592] processed_samples 400 unjoint_samples 400 joint_samples 0 [707094, 705592] processed_samples 400 unjoint_samples 400 joint_samples 0 [658632, 655574] processed_samples 400 unjoint_samples 400 joint_samples 0 [658632, 655574] processed_samples 400 unjoint_samples 400 joint_samples 0 [821555, 816781] processed_samples 400 unjoint_samples 400 joint_samples 0 [821555, 816781] [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d508162180] mmco: unref short failure [h264 @ 0x55d508162180] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 500 unjoint_samples 500 joint_samples 0 [841955, 842678] processed_samples 500 unjoint_samples 500 joint_samples 0 [841955, 842678] processed_samples 500 unjoint_samples 500 joint_samples 0 [983147, 985847] processed_samples 500 unjoint_samples 500 joint_samples 0 [881560, 881559] processed_samples 500 unjoint_samples 500 joint_samples 0 [894887, 912251] processed_samples 500 unjoint_samples 500 joint_samples 0 [881560, 881559] processed_samples 500 unjoint_samples 500 joint_samples 0 [983147, 985847] processed_samples 500 unjoint_samples 500 joint_samples 0 [894887, 912251] processed_samples 500 unjoint_samples 500 joint_samples 0 [853383, 835984] processed_samples 500 unjoint_samples 500 joint_samples 0 [853383, 835984] processed_samples 500 unjoint_samples 500 joint_samples 0 [920412, 921212] processed_samples 500 unjoint_samples 500 joint_samples 0 [920412, 921212] processed_samples 500 unjoint_samples 500 joint_samples 0 [811098, 880230] processed_samples 500 unjoint_samples 500 joint_samples 0 [811098, 880230] processed_samples 500 unjoint_samples 500 joint_samples 0 [830129, 831340] processed_samples 500 unjoint_samples 500 joint_samples 0 [830129, 831340] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure ................................................................................................ [2024-11-28 16:16:41] iteration 1/ 500 | consumed samples: 8 | elapsed time per iteration (ms): 1178502.3 | throughput per GPU (TFLOP/s/GPU): 44.2 | learning rate: 3.333333E-07 | global batch size: 8 | lm loss: 6.629787E+00 | loss scale: 1.0 | grad norm: 71.391 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b2e3080] mmco: unref short failure [h264 @ 0x56215b2e3080] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-11-28 16:24:22] iteration 2/ 500 | consumed samples: 16 | elapsed time per iteration (ms): 460889.5 | throughput per GPU (TFLOP/s/GPU): 112.9 | learning rate: 6.666667E-07 | global batch size: 8 | lm loss: 6.407792E+00 | loss scale: 1.0 | grad norm: 50.194 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 16:30:23] iteration 3/ 500 | consumed samples: 24 | elapsed time per iteration (ms): 360271.6 | throughput per GPU (TFLOP/s/GPU): 144.4 | learning rate: 1.000000E-06 | global batch size: 8 | lm loss: 6.310083E+00 | loss scale: 1.0 | grad norm: 43.834 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x55d513c2d080] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 600 unjoint_samples 600 joint_samples 1 [1046784, 140984] processed_samples 600 unjoint_samples 600 joint_samples 1 [1046784, 140984] processed_samples 600 unjoint_samples 600 joint_samples 1 [992692, 253508] processed_samples 600 unjoint_samples 600 joint_samples 1 [992692, 253508] processed_samples 600 unjoint_samples 600 joint_samples 1 [329003, 1016149] processed_samples 600 unjoint_samples 600 joint_samples 1 [329003, 1016149] processed_samples 600 unjoint_samples 600 joint_samples 0 [999002, 1001861] processed_samples 600 unjoint_samples 600 joint_samples 0 [1023421, 1025360] processed_samples 600 unjoint_samples 600 joint_samples 0 [935448, 933940] processed_samples 600 unjoint_samples 600 joint_samples 0 [935448, 933940] processed_samples 600 unjoint_samples 600 joint_samples 0 [1023421, 1025360] processed_samples 600 unjoint_samples 600 joint_samples 0 [999002, 1001861] processed_samples 600 unjoint_samples 600 joint_samples 0 [1028346, 1017978] processed_samples 600 unjoint_samples 600 joint_samples 0 [1028346, 1017978] processed_samples 600 unjoint_samples 600 joint_samples 0 [968364, 969654] processed_samples 600 unjoint_samples 600 joint_samples 0 [968364, 969654] [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [2024-11-28 16:37:15] iteration 4/ 500 | consumed samples: 32 | elapsed time per iteration (ms): 412089.2 | throughput per GPU (TFLOP/s/GPU): 126.3 | learning rate: 1.333333E-06 | global batch size: 8 | lm loss: 6.219399E+00 | loss scale: 1.0 | grad norm: 57.408 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-11-28 16:43:37] iteration 5/ 500 | consumed samples: 40 | elapsed time per iteration (ms): 382756.5 | throughput per GPU (TFLOP/s/GPU): 136.0 | learning rate: 1.666667E-06 | global batch size: 8 | lm loss: 6.177355E+00 | loss scale: 1.0 | grad norm: 40.531 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [2024-11-28 16:49:48] iteration 6/ 500 | consumed samples: 48 | elapsed time per iteration (ms): 370317.2 | throughput per GPU (TFLOP/s/GPU): 140.5 | learning rate: 2.000000E-06 | global batch size: 8 | lm loss: 6.195541E+00 | loss scale: 1.0 | grad norm: 37.413 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [2024-11-28 16:55:57] iteration 7/ 500 | consumed samples: 56 | elapsed time per iteration (ms): 369201.8 | throughput per GPU (TFLOP/s/GPU): 141.0 | learning rate: 2.333333E-06 | global batch size: 8 | lm loss: 6.063354E+00 | loss scale: 1.0 | grad norm: 77.346 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [2024-11-28 17:02:47] iteration 8/ 500 | consumed samples: 64 | elapsed time per iteration (ms): 409604.1 | throughput per GPU (TFLOP/s/GPU): 127.1 | learning rate: 2.666667E-06 | global batch size: 8 | lm loss: 5.870256E+00 | loss scale: 1.0 | grad norm: 28.693 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure processed_samples 700 unjoint_samples 700 joint_samples 1 [252030, 979033] processed_samples 700 unjoint_samples 700 joint_samples 1 [115459, 1044928] processed_samples 700 unjoint_samples 700 joint_samples 1 [252030, 979033] processed_samples 700 unjoint_samples 700 joint_samples 1 [115459, 1044928] processed_samples 700 unjoint_samples 700 joint_samples 1 [332780, 1018922] processed_samples 700 unjoint_samples 700 joint_samples 1 [332780, 1018922] processed_samples 700 unjoint_samples 700 joint_samples 1 [344433, 1036861] processed_samples 700 unjoint_samples 700 joint_samples 1 [344433, 1036861] processed_samples 700 unjoint_samples 700 joint_samples 1 [191911, 1045441] processed_samples 700 unjoint_samples 700 joint_samples 1 [191911, 1045441] processed_samples 700 unjoint_samples 700 joint_samples 1 [643940, 1016149] processed_samples 700 unjoint_samples 700 joint_samples 1 [643940, 1016149] processed_samples 700 unjoint_samples 700 joint_samples 1 [1046784, 453302] processed_samples 700 unjoint_samples 700 joint_samples 1 [1046784, 453302] processed_samples 700 unjoint_samples 700 joint_samples 1 [992692, 517451] processed_samples 700 unjoint_samples 700 joint_samples 1 [992692, 517451] [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure processed_samples 800 unjoint_samples 800 joint_samples 1 [600026, 1018922] processed_samples 800 unjoint_samples 800 joint_samples 1 [600026, 1018922] processed_samples 800 unjoint_samples 800 joint_samples 1 [480367, 1044928] processed_samples 800 unjoint_samples 800 joint_samples 1 [480367, 1044928] processed_samples 800 unjoint_samples 800 joint_samples 1 [1046784, 733804] processed_samples 800 unjoint_samples 800 joint_samples 1 [443725, 1045441] processed_samples 800 unjoint_samples 800 joint_samples 1 [443725, 1045441] processed_samples 800 unjoint_samples 800 joint_samples 1 [980254, 1016149] processed_samples 800 unjoint_samples 800 joint_samples 1 [721471, 1036861] processed_samples 800 unjoint_samples 800 joint_samples 1 [1046784, 733804] processed_samples 800 unjoint_samples 800 joint_samples 1 [992692, 843491] processed_samples 800 unjoint_samples 800 joint_samples 1 [730050, 979033] processed_samples 800 unjoint_samples 800 joint_samples 1 [721471, 1036861] [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 800 unjoint_samples 800 joint_samples 1 [992692, 843491] processed_samples 800 unjoint_samples 800 joint_samples 1 [730050, 979033] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 800 unjoint_samples 800 joint_samples 1 [980254, 1016149] [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [2024-11-28 17:13:48] iteration 9/ 500 | consumed samples: 72 | elapsed time per iteration (ms): 660898.0 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 3.000000E-06 | global batch size: 8 | lm loss: 5.787996E+00 | loss scale: 1.0 | grad norm: 29.784 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [2024-11-28 17:21:06] iteration 10/ 500 | consumed samples: 80 | elapsed time per iteration (ms): 438615.6 | throughput per GPU (TFLOP/s/GPU): 118.6 | learning rate: 3.333333E-06 | global batch size: 8 | lm loss: 5.712118E+00 | loss scale: 1.0 | grad norm: 43.059 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [2024-11-28 17:27:12] iteration 11/ 500 | consumed samples: 88 | elapsed time per iteration (ms): 365964.4 | throughput per GPU (TFLOP/s/GPU): 142.2 | learning rate: 3.666667E-06 | global batch size: 8 | lm loss: 5.826314E+00 | loss scale: 1.0 | grad norm: 24.661 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure processed_samples 900 unjoint_samples 900 joint_samples 1 [685603, 1044928] processed_samples 900 unjoint_samples 900 joint_samples 1 [685603, 1044928] processed_samples 900 unjoint_samples 900 joint_samples 2 [1035512, 215070] processed_samples 900 unjoint_samples 900 joint_samples 2 [1035512, 215070] processed_samples 900 unjoint_samples 900 joint_samples 2 [169160, 1045271] processed_samples 900 unjoint_samples 900 joint_samples 1 [870427, 1018922] processed_samples 900 unjoint_samples 900 joint_samples 1 [870427, 1018922] processed_samples 900 unjoint_samples 900 joint_samples 2 [169160, 1045271] processed_samples 900 unjoint_samples 900 joint_samples 2 [44304, 1042650] processed_samples 900 unjoint_samples 900 joint_samples 2 [44304, 1042650] processed_samples 900 unjoint_samples 900 joint_samples 1 [1009966, 1036861] [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure processed_samples 900 unjoint_samples 900 joint_samples 1 [1009966, 1036861] processed_samples 900 unjoint_samples 900 joint_samples 1 [1002137, 1001356] processed_samples 900 unjoint_samples 900 joint_samples 1 [769375, 1045441] processed_samples 900 unjoint_samples 900 joint_samples 1 [1002137, 1001356] processed_samples 900 unjoint_samples 900 joint_samples 1 [769375, 1045441] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [2024-11-28 17:34:31] iteration 12/ 500 | consumed samples: 96 | elapsed time per iteration (ms): 439260.5 | throughput per GPU (TFLOP/s/GPU): 118.5 | learning rate: 4.000000E-06 | global batch size: 8 | lm loss: 5.277246E+00 | loss scale: 1.0 | grad norm: 25.843 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [2024-11-28 17:40:54] iteration 13/ 500 | consumed samples: 104 | elapsed time per iteration (ms): 382309.0 | throughput per GPU (TFLOP/s/GPU): 136.1 | learning rate: 4.333333E-06 | global batch size: 8 | lm loss: 5.065854E+00 | loss scale: 1.0 | grad norm: 19.160 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [2024-11-28 17:47:00] iteration 14/ 500 | consumed samples: 112 | elapsed time per iteration (ms): 366539.7 | throughput per GPU (TFLOP/s/GPU): 142.0 | learning rate: 4.666667E-06 | global batch size: 8 | lm loss: 4.861617E+00 | loss scale: 1.0 | grad norm: 18.576 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 17:54:35] iteration 15/ 500 | consumed samples: 120 | elapsed time per iteration (ms): 454227.2 | throughput per GPU (TFLOP/s/GPU): 114.6 | learning rate: 5.000000E-06 | global batch size: 8 | lm loss: 5.125950E+00 | loss scale: 1.0 | grad norm: 21.127 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 18:00:40] iteration 16/ 500 | consumed samples: 128 | elapsed time per iteration (ms): 365586.7 | throughput per GPU (TFLOP/s/GPU): 142.3 | learning rate: 4.999949E-06 | global batch size: 8 | lm loss: 4.770656E+00 | loss scale: 1.0 | grad norm: 16.546 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1045990, 46173] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1045990, 46173] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1029364, 115962] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1029364, 115962] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1047713, 111784] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1047713, 111784] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [84176, 1047539] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [84176, 1047539] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [389118, 1042650] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [389118, 1042650] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1035512, 460681] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1035512, 460681] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [542798, 1045271] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [542798, 1045271] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1036356, 343956] processed_samples 1000 unjoint_samples 1000 joint_samples 2 [1036356, 343956] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215bac3ac0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1047713, 390717] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1047713, 390717] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1045990, 377675] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1029364, 428331] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1045990, 377675] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1029364, 428331] [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure processed_samples 1100 unjoint_samples 1100 joint_samples 2 [379155, 1047539] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [379155, 1047539] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [737670, 1042650] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [737670, 1042650] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [937076, 1045271] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1036356, 690281] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [937076, 1045271] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1036356, 690281] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1035512, 745640] processed_samples 1100 unjoint_samples 1100 joint_samples 2 [1035512, 745640] [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [2024-11-28 18:11:45] iteration 17/ 500 | consumed samples: 136 | elapsed time per iteration (ms): 665218.5 | throughput per GPU (TFLOP/s/GPU): 78.2 | learning rate: 4.999794E-06 | global batch size: 8 | lm loss: 4.786225E+00 | loss scale: 1.0 | grad norm: 75.620 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1045990, 736951] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1029364, 716107] processed_samples 1200 unjoint_samples 1200 joint_samples 3 [178926, 1047160] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1045990, 736951] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1029364, 716107] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1047713, 764007] processed_samples 1200 unjoint_samples 1200 joint_samples 3 [178926, 1047160] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1036356, 1004222] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [963999, 1042650] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1047713, 764007] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1036356, 1004222] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1043823, 1042971] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [963999, 1042650] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [660021, 1047539] processed_samples 1200 unjoint_samples 1200 joint_samples 2 [1043823, 1042971] [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 1200 unjoint_samples 1200 joint_samples 2 [660021, 1047539] [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [2024-11-28 18:19:51] iteration 18/ 500 | consumed samples: 144 | elapsed time per iteration (ms): 485884.5 | throughput per GPU (TFLOP/s/GPU): 107.1 | learning rate: 4.999537E-06 | global batch size: 8 | lm loss: 4.659023E+00 | loss scale: 1.0 | grad norm: 14.797 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [2024-11-28 18:27:20] iteration 19/ 500 | consumed samples: 152 | elapsed time per iteration (ms): 449170.3 | throughput per GPU (TFLOP/s/GPU): 115.9 | learning rate: 4.999178E-06 | global batch size: 8 | lm loss: 4.296840E+00 | loss scale: 1.0 | grad norm: 10.513 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [2024-11-28 18:34:39] iteration 20/ 500 | consumed samples: 160 | elapsed time per iteration (ms): 438903.9 | throughput per GPU (TFLOP/s/GPU): 118.6 | learning rate: 4.998715E-06 | global batch size: 8 | lm loss: 4.165969E+00 | loss scale: 1.0 | grad norm: 8.266 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (107908.65, 107908.97) [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [2024-11-28 18:44:02] iteration 21/ 500 | consumed samples: 168 | elapsed time per iteration (ms): 454286.3 | throughput per GPU (TFLOP/s/GPU): 114.6 | learning rate: 4.998150E-06 | global batch size: 8 | lm loss: 4.101646E+00 | loss scale: 1.0 | grad norm: 8.682 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1046489, 7826] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1046489, 7826] processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1045990, 1007691] processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1045990, 1007691] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [481372, 1047160] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [481372, 1047160] processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1047713, 994410] processed_samples 1300 unjoint_samples 1300 joint_samples 2 [1047713, 994410] [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1045557, 290992] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1045557, 290992] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [4593, 1047539] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [275734, 1043972] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [275734, 1043972] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [4593, 1047539] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1048104, 255593] processed_samples 1300 unjoint_samples 1300 joint_samples 3 [1048104, 255593] [2024-11-28 18:49:43] iteration 22/ 500 | consumed samples: 176 | elapsed time per iteration (ms): 341280.8 | throughput per GPU (TFLOP/s/GPU): 152.5 | learning rate: 4.997482E-06 | global batch size: 8 | lm loss: 3.860502E+00 | loss scale: 1.0 | grad norm: 12.678 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [2024-11-28 18:56:32] iteration 23/ 500 | consumed samples: 184 | elapsed time per iteration (ms): 408951.4 | throughput per GPU (TFLOP/s/GPU): 127.3 | learning rate: 4.996711E-06 | global batch size: 8 | lm loss: 3.689850E+00 | loss scale: 1.0 | grad norm: 8.323 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 19:02:55] iteration 24/ 500 | consumed samples: 192 | elapsed time per iteration (ms): 383067.5 | throughput per GPU (TFLOP/s/GPU): 135.9 | learning rate: 4.995838E-06 | global batch size: 8 | lm loss: 3.747823E+00 | loss scale: 1.0 | grad norm: 8.168 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure processed_samples 1400 unjoint_samples 1400 joint_samples 3 [176477, 1047743] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [232404, 1047554] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1046489, 339204] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [329617, 1047539] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [176477, 1047743] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1046489, 339204] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [232404, 1047554] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [612513, 1043972] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1045557, 589201] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [329617, 1047539] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [817074, 1047160] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [612513, 1043972] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1048104, 574654] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1048104, 574654] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [1045557, 589201] processed_samples 1400 unjoint_samples 1400 joint_samples 3 [817074, 1047160] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b95e700] mmco: unref short failure [h264 @ 0x56215b95e700] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [2024-11-28 19:13:38] iteration 25/ 500 | consumed samples: 200 | elapsed time per iteration (ms): 643017.6 | throughput per GPU (TFLOP/s/GPU): 80.9 | learning rate: 4.994862E-06 | global batch size: 8 | lm loss: 3.567746E+00 | loss scale: 1.0 | grad norm: 7.384 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 1500 unjoint_samples 1500 joint_samples 4 [15562, 1047160] processed_samples 1500 unjoint_samples 1500 joint_samples 4 [15562, 1047160] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [616066, 1047554] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1048104, 907324] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1046489, 623184] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1046489, 623184] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [616066, 1047554] [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 1500 unjoint_samples 1500 joint_samples 3 [908565, 1043972] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1048104, 907324] [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 1500 unjoint_samples 1500 joint_samples 3 [624037, 1047539] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [624037, 1047539] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [602378, 1047743] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1045557, 935157] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [1045557, 935157] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [908565, 1043972] processed_samples 1500 unjoint_samples 1500 joint_samples 3 [602378, 1047743] [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [2024-11-28 19:21:41] iteration 26/ 500 | consumed samples: 208 | elapsed time per iteration (ms): 482923.1 | throughput per GPU (TFLOP/s/GPU): 107.8 | learning rate: 4.993783E-06 | global batch size: 8 | lm loss: 3.596097E+00 | loss scale: 1.0 | grad norm: 6.934 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [2024-11-28 19:28:44] iteration 27/ 500 | consumed samples: 216 | elapsed time per iteration (ms): 422673.3 | throughput per GPU (TFLOP/s/GPU): 123.1 | learning rate: 4.992602E-06 | global batch size: 8 | lm loss: 3.274019E+00 | loss scale: 1.0 | grad norm: 6.573 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [2024-11-28 19:35:43] iteration 28/ 500 | consumed samples: 224 | elapsed time per iteration (ms): 419757.8 | throughput per GPU (TFLOP/s/GPU): 124.0 | learning rate: 4.991319E-06 | global batch size: 8 | lm loss: 3.158726E+00 | loss scale: 1.0 | grad norm: 5.771 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1046489, 946723] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [114808, 1047945] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [114808, 1047945] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [286993, 1047160] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [285396, 1047119] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [286993, 1047160] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [155550, 1042992] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [285396, 1047119] processed_samples 1600 unjoint_samples 1600 joint_samples 4 [155550, 1042992] processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1046489, 946723] processed_samples 1600 unjoint_samples 1600 joint_samples 3 [927098, 1047743] processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1010244, 1047539] [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 1600 unjoint_samples 1600 joint_samples 3 [927098, 1047743] processed_samples 1600 unjoint_samples 1600 joint_samples 3 [1010244, 1047539] processed_samples 1600 unjoint_samples 1600 joint_samples 3 [917247, 1047554] processed_samples 1600 unjoint_samples 1600 joint_samples 3 [917247, 1047554] [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [2024-11-28 19:43:34] iteration 29/ 500 | consumed samples: 232 | elapsed time per iteration (ms): 470278.0 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 4.989933E-06 | global batch size: 8 | lm loss: 3.036888E+00 | loss scale: 1.0 | grad norm: 16.115 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-28 19:50:26] iteration 30/ 500 | consumed samples: 240 | elapsed time per iteration (ms): 412673.2 | throughput per GPU (TFLOP/s/GPU): 126.1 | learning rate: 4.988444E-06 | global batch size: 8 | lm loss: 2.989427E+00 | loss scale: 1.0 | grad norm: 25.443 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [2024-11-28 19:56:45] iteration 31/ 500 | consumed samples: 248 | elapsed time per iteration (ms): 378909.1 | throughput per GPU (TFLOP/s/GPU): 137.3 | learning rate: 4.986854E-06 | global batch size: 8 | lm loss: 2.768105E+00 | loss scale: 1.0 | grad norm: 12.204 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-28 20:03:40] iteration 32/ 500 | consumed samples: 256 | elapsed time per iteration (ms): 415089.0 | throughput per GPU (TFLOP/s/GPU): 125.4 | learning rate: 4.985161E-06 | global batch size: 8 | lm loss: 2.958685E+00 | loss scale: 1.0 | grad norm: 69.520 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure processed_samples 1700 unjoint_samples 1700 joint_samples 4 [152708, 1047743] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [152708, 1047743] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1046327, 286245] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1046327, 286245] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [227378, 1046063] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [227378, 1046063] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [369421, 1047945] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [369421, 1047945] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1042972, 142693] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [1042972, 142693] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [644299, 1047119] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [570080, 1047160] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [435158, 1042992] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [435158, 1042992] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [644299, 1047119] processed_samples 1700 unjoint_samples 1700 joint_samples 4 [570080, 1047160] [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514f19000] mmco: unref short failure [h264 @ 0x55d514f19000] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514f19000] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [577867, 1046063] [h264 @ 0x56215b9b7740] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1042972, 527675] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [429748, 1047743] [h264 @ 0x55d5194f31c0] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [708231, 1042992] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1046327, 547165] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [627333, 1047945] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [896495, 1047160] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [871854, 1047119] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [577867, 1046063] [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1042972, 527675] [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [429748, 1047743] [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [708231, 1042992] [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [896495, 1047160] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [1046327, 547165] processed_samples 1800 unjoint_samples 1800 joint_samples 4 [627333, 1047945] [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure processed_samples 1800 unjoint_samples 1800 joint_samples 4 [871854, 1047119] [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512ccf0c0] mmco: unref short failure [h264 @ 0x55d512ccf0c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [2024-11-28 20:14:30] iteration 33/ 500 | consumed samples: 264 | elapsed time per iteration (ms): 649242.9 | throughput per GPU (TFLOP/s/GPU): 80.2 | learning rate: 4.983366E-06 | global batch size: 8 | lm loss: 2.799433E+00 | loss scale: 1.0 | grad norm: 8.850 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 20:23:00] iteration 34/ 500 | consumed samples: 272 | elapsed time per iteration (ms): 510659.3 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 4.981468E-06 | global batch size: 8 | lm loss: 2.573955E+00 | loss scale: 1.0 | grad norm: 9.165 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215e8edc00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215e8edc00] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 5 [273341, 1047160] processed_samples 1900 unjoint_samples 1900 joint_samples 5 [105106, 1047119] processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1042972, 818526] processed_samples 1900 unjoint_samples 1900 joint_samples 4 [743031, 1047743] processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1039483, 1042992] [h264 @ 0x55d51723b300] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [862930, 1046063] [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1046327, 1006557] [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1031627, 1047945] [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 5 [273341, 1047160] processed_samples 1900 unjoint_samples 1900 joint_samples 4 [743031, 1047743] processed_samples 1900 unjoint_samples 1900 joint_samples 5 [105106, 1047119] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1042972, 818526] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [862930, 1046063] processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1039483, 1042992] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1046327, 1006557] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 1900 unjoint_samples 1900 joint_samples 4 [1031627, 1047945] [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [2024-11-28 20:30:33] iteration 35/ 500 | consumed samples: 280 | elapsed time per iteration (ms): 452777.2 | throughput per GPU (TFLOP/s/GPU): 114.9 | learning rate: 4.979469E-06 | global batch size: 8 | lm loss: 2.710365E+00 | loss scale: 1.0 | grad norm: 8.172 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 20:38:40] iteration 36/ 500 | consumed samples: 288 | elapsed time per iteration (ms): 486962.5 | throughput per GPU (TFLOP/s/GPU): 106.9 | learning rate: 4.977368E-06 | global batch size: 8 | lm loss: 2.466654E+00 | loss scale: 1.0 | grad norm: 8.771 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-28 20:45:53] iteration 37/ 500 | consumed samples: 296 | elapsed time per iteration (ms): 433240.8 | throughput per GPU (TFLOP/s/GPU): 120.1 | learning rate: 4.975165E-06 | global batch size: 8 | lm loss: 2.457261E+00 | loss scale: 1.0 | grad norm: 6.649 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-11-28 20:52:30] iteration 38/ 500 | consumed samples: 304 | elapsed time per iteration (ms): 396727.3 | throughput per GPU (TFLOP/s/GPU): 131.2 | learning rate: 4.972860E-06 | global batch size: 8 | lm loss: 2.120668E+00 | loss scale: 1.0 | grad norm: 9.612 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-11-28 20:59:15] iteration 39/ 500 | consumed samples: 312 | elapsed time per iteration (ms): 404869.9 | throughput per GPU (TFLOP/s/GPU): 128.5 | learning rate: 4.970454E-06 | global batch size: 8 | lm loss: 2.032905E+00 | loss scale: 1.0 | grad norm: 6.617 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1025880, 172231] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1045633, 404501] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1047005, 247576] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [614626, 1047160] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1046427, 219041] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [82963, 1044981] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [389277, 1047119] processed_samples 2000 unjoint_samples 2000 joint_samples 4 [1025173, 1047743] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1025880, 172231] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1045633, 404501] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1047005, 247576] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [82963, 1044981] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [614626, 1047160] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [1046427, 219041] processed_samples 2000 unjoint_samples 2000 joint_samples 5 [389277, 1047119] processed_samples 2000 unjoint_samples 2000 joint_samples 4 [1025173, 1047743] [2024-11-28 21:07:02] iteration 40/ 500 | consumed samples: 320 | elapsed time per iteration (ms): 467126.8 | throughput per GPU (TFLOP/s/GPU): 111.4 | learning rate: 4.967946E-06 | global batch size: 8 | lm loss: 2.002694E+00 | loss scale: 1.0 | grad norm: 8.476 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (230397.23, 230397.69) [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 2100 unjoint_samples 2100 joint_samples 5 [417457, 1044981] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1025880, 494031] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1029293, 416295] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1047005, 733854] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1046427, 573043] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1025880, 494031] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [417457, 1044981] [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1045633, 793613] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1029293, 416295] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [678301, 1047119] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [924889, 1047160] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1047005, 733854] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1046427, 573043] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [924889, 1047160] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure processed_samples 2100 unjoint_samples 2100 joint_samples 5 [1045633, 793613] processed_samples 2100 unjoint_samples 2100 joint_samples 5 [678301, 1047119] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [2024-11-28 21:20:28] iteration 41/ 500 | consumed samples: 328 | elapsed time per iteration (ms): 575913.1 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.965337E-06 | global batch size: 8 | lm loss: 2.023730E+00 | loss scale: 1.0 | grad norm: 7.992 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [2024-11-28 21:27:41] iteration 42/ 500 | consumed samples: 336 | elapsed time per iteration (ms): 432358.6 | throughput per GPU (TFLOP/s/GPU): 120.4 | learning rate: 4.962626E-06 | global batch size: 8 | lm loss: 1.950212E+00 | loss scale: 1.0 | grad norm: 10.135 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51325e000] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [2024-11-28 21:35:07] iteration 43/ 500 | consumed samples: 344 | elapsed time per iteration (ms): 446286.7 | throughput per GPU (TFLOP/s/GPU): 116.6 | learning rate: 4.959814E-06 | global batch size: 8 | lm loss: 1.985991E+00 | loss scale: 1.0 | grad norm: 8.730 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 2200 unjoint_samples 2200 joint_samples 6 [64780, 1022454] processed_samples 2200 unjoint_samples 2200 joint_samples 6 [153094, 1047160] processed_samples 2200 unjoint_samples 2200 joint_samples 6 [153094, 1047160] processed_samples 2200 unjoint_samples 2200 joint_samples 6 [64780, 1022454] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1029293, 760834] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025880, 953414] processed_samples 2200 unjoint_samples 2200 joint_samples 6 [17659, 1045609] processed_samples 2200 unjoint_samples 2200 joint_samples 6 [17659, 1045609] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1046427, 997952] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1029293, 760834] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [680997, 1044981] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [680997, 1044981] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025880, 953414] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025348, 1047119] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1025348, 1047119] processed_samples 2200 unjoint_samples 2200 joint_samples 5 [1046427, 997952] [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [2024-11-28 21:42:48] iteration 44/ 500 | consumed samples: 352 | elapsed time per iteration (ms): 460424.0 | throughput per GPU (TFLOP/s/GPU): 113.0 | learning rate: 4.956901E-06 | global batch size: 8 | lm loss: 1.866224E+00 | loss scale: 1.0 | grad norm: 7.184 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-28 21:49:34] iteration 45/ 500 | consumed samples: 360 | elapsed time per iteration (ms): 406346.7 | throughput per GPU (TFLOP/s/GPU): 128.1 | learning rate: 4.953887E-06 | global batch size: 8 | lm loss: 1.858991E+00 | loss scale: 1.0 | grad norm: 10.926 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-11-28 21:56:53] iteration 46/ 500 | consumed samples: 368 | elapsed time per iteration (ms): 439465.9 | throughput per GPU (TFLOP/s/GPU): 118.4 | learning rate: 4.950772E-06 | global batch size: 8 | lm loss: 1.825678E+00 | loss scale: 1.0 | grad norm: 9.273 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure processed_samples 2300 unjoint_samples 2300 joint_samples 6 [168460, 1021869] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [418718, 1047160] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [299051, 1046402] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [328027, 1045609] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [384015, 1022454] processed_samples 2300 unjoint_samples 2300 joint_samples 5 [1029293, 999533] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [1046773, 296268] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [168460, 1021869] processed_samples 2300 unjoint_samples 2300 joint_samples 5 [997435, 1044981] [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 2300 unjoint_samples 2300 joint_samples 5 [997435, 1044981] [h264 @ 0x55d514594d00] mmco: unref short failure processed_samples 2300 unjoint_samples 2300 joint_samples 6 [299051, 1046402] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [328027, 1045609] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [1046773, 296268] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [384015, 1022454] processed_samples 2300 unjoint_samples 2300 joint_samples 6 [418718, 1047160] processed_samples 2300 unjoint_samples 2300 joint_samples 5 [1029293, 999533] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-11-28 22:04:56] iteration 47/ 500 | consumed samples: 376 | elapsed time per iteration (ms): 482488.9 | throughput per GPU (TFLOP/s/GPU): 107.9 | learning rate: 4.947556E-06 | global batch size: 8 | lm loss: 1.775909E+00 | loss scale: 1.0 | grad norm: 10.026 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-28 22:13:24] iteration 48/ 500 | consumed samples: 384 | elapsed time per iteration (ms): 508396.6 | throughput per GPU (TFLOP/s/GPU): 102.4 | learning rate: 4.944240E-06 | global batch size: 8 | lm loss: 1.686098E+00 | loss scale: 1.0 | grad norm: 8.427 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 2400 unjoint_samples 2400 joint_samples 6 [214219, 1046833] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [531637, 1021869] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1041184, 189346] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [662814, 1045609] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1046773, 620573] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [717427, 1022454] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure processed_samples 2400 unjoint_samples 2400 joint_samples 6 [562488, 1046402] [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 2400 unjoint_samples 2400 joint_samples 6 [779201, 1047160] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [531637, 1021869] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [214219, 1046833] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1041184, 189346] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [779201, 1047160] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [662814, 1045609] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [1046773, 620573] [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 2400 unjoint_samples 2400 joint_samples 6 [562488, 1046402] processed_samples 2400 unjoint_samples 2400 joint_samples 6 [717427, 1022454] [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [2024-11-28 22:22:49] iteration 49/ 500 | consumed samples: 392 | elapsed time per iteration (ms): 565072.6 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 4.940823E-06 | global batch size: 8 | lm loss: 1.693457E+00 | loss scale: 1.0 | grad norm: 8.878 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [2024-11-28 22:33:36] iteration 50/ 500 | consumed samples: 400 | elapsed time per iteration (ms): 646385.2 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 4.937306E-06 | global batch size: 8 | lm loss: 1.632946E+00 | loss scale: 1.0 | grad norm: 9.775 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 2500 unjoint_samples 2500 joint_samples 6 [963034, 1045609] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1041184, 484451] processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1040035, 152091] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [963034, 1045609] processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1040035, 152091] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1041184, 484451] processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1037182, 27542] processed_samples 2500 unjoint_samples 2500 joint_samples 7 [1037182, 27542] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [898543, 1046402] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [653431, 1046833] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [653431, 1046833] [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1046773, 1005163] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [780576, 1021869] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [780576, 1021869] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [898543, 1046402] processed_samples 2500 unjoint_samples 2500 joint_samples 6 [1046773, 1005163] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [2024-11-28 22:43:43] iteration 51/ 500 | consumed samples: 408 | elapsed time per iteration (ms): 606950.8 | throughput per GPU (TFLOP/s/GPU): 85.7 | learning rate: 4.933689E-06 | global batch size: 8 | lm loss: 1.538779E+00 | loss scale: 1.0 | grad norm: 6.345 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [2024-11-28 22:52:19] iteration 52/ 500 | consumed samples: 416 | elapsed time per iteration (ms): 516548.8 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 4.929971E-06 | global batch size: 8 | lm loss: 1.636017E+00 | loss scale: 1.0 | grad norm: 6.369 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [2024-11-28 22:58:46] iteration 53/ 500 | consumed samples: 424 | elapsed time per iteration (ms): 386918.3 | throughput per GPU (TFLOP/s/GPU): 134.5 | learning rate: 4.926154E-06 | global batch size: 8 | lm loss: 1.564707E+00 | loss scale: 1.0 | grad norm: 6.224 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [2024-11-28 23:06:07] iteration 54/ 500 | consumed samples: 432 | elapsed time per iteration (ms): 441009.5 | throughput per GPU (TFLOP/s/GPU): 118.0 | learning rate: 4.922237E-06 | global batch size: 8 | lm loss: 1.536033E+00 | loss scale: 1.0 | grad norm: 14.530 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1030392, 45401] [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1037182, 451076] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1033539, 202907] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1030392, 45401] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1047495, 231869] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [218226, 1046760] processed_samples 2600 unjoint_samples 2600 joint_samples 6 [1041184, 749174] [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1040035, 418957] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1037182, 451076] processed_samples 2600 unjoint_samples 2600 joint_samples 6 [920232, 1046833] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1033539, 202907] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1047495, 231869] [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 2600 unjoint_samples 2600 joint_samples 7 [218226, 1046760] processed_samples 2600 unjoint_samples 2600 joint_samples 6 [1041184, 749174] processed_samples 2600 unjoint_samples 2600 joint_samples 7 [1040035, 418957] processed_samples 2600 unjoint_samples 2600 joint_samples 6 [920232, 1046833] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [2024-11-28 23:13:26] iteration 55/ 500 | consumed samples: 440 | elapsed time per iteration (ms): 439133.8 | throughput per GPU (TFLOP/s/GPU): 118.5 | learning rate: 4.918221E-06 | global batch size: 8 | lm loss: 1.505980E+00 | loss scale: 1.0 | grad norm: 7.839 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [2024-11-28 23:21:09] iteration 56/ 500 | consumed samples: 448 | elapsed time per iteration (ms): 463070.7 | throughput per GPU (TFLOP/s/GPU): 112.4 | learning rate: 4.914105E-06 | global batch size: 8 | lm loss: 1.489059E+00 | loss scale: 1.0 | grad norm: 5.188 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1030392, 346973] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [182480, 1023531] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [181982, 1046833] [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1047495, 577059] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1030392, 346973] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [182480, 1023531] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [181982, 1046833] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [501986, 1046760] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1047495, 577059] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [501986, 1046760] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1033539, 488905] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1037182, 785261] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1033539, 488905] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1040035, 635866] processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1040035, 635866] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 2700 unjoint_samples 2700 joint_samples 7 [1037182, 785261] [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-11-28 23:30:13] iteration 57/ 500 | consumed samples: 456 | elapsed time per iteration (ms): 543396.8 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 4.909890E-06 | global batch size: 8 | lm loss: 1.563977E+00 | loss scale: 1.0 | grad norm: 8.261 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 2800 unjoint_samples 2800 joint_samples 7 [458308, 1046833] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [458308, 1046833] processed_samples 2800 unjoint_samples 2800 joint_samples 8 [10513, 1046053] processed_samples 2800 unjoint_samples 2800 joint_samples 8 [10513, 1046053] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1040035, 936194] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1040035, 936194] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1030392, 694809] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1030392, 694809] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [762415, 1046760] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [762415, 1046760] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1047495, 905613] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1047495, 905613] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [542048, 1023531] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [542048, 1023531] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1033539, 903348] processed_samples 2800 unjoint_samples 2800 joint_samples 7 [1033539, 903348] [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-11-28 23:38:26] iteration 58/ 500 | consumed samples: 464 | elapsed time per iteration (ms): 493526.9 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.905577E-06 | global batch size: 8 | lm loss: 1.370810E+00 | loss scale: 1.0 | grad norm: 4.837 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-28 23:48:14] iteration 59/ 500 | consumed samples: 472 | elapsed time per iteration (ms): 587853.1 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 4.901164E-06 | global batch size: 8 | lm loss: 1.455521E+00 | loss scale: 1.0 | grad norm: 3.914 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-28 23:56:10] iteration 60/ 500 | consumed samples: 480 | elapsed time per iteration (ms): 475241.1 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 4.896652E-06 | global batch size: 8 | lm loss: 1.443850E+00 | loss scale: 1.0 | grad norm: 3.787 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (225408.87, 225409.30) [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 2900 unjoint_samples 2900 joint_samples 8 [183546, 1039187] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [912879, 1023531] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [192338, 1036659] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1004522, 1046760] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [159690, 1030365] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [820339, 1046833] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [183546, 1039187] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [266009, 1046053] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [912879, 1023531] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1030392, 1025591] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [820339, 1046833] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [192338, 1036659] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1004522, 1046760] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [159690, 1030365] processed_samples 2900 unjoint_samples 2900 joint_samples 7 [1030392, 1025591] processed_samples 2900 unjoint_samples 2900 joint_samples 8 [266009, 1046053] [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [2024-11-29 00:07:26] iteration 61/ 500 | consumed samples: 488 | elapsed time per iteration (ms): 450783.8 | throughput per GPU (TFLOP/s/GPU): 115.4 | learning rate: 4.892043E-06 | global batch size: 8 | lm loss: 1.438554E+00 | loss scale: 1.0 | grad norm: 11.352 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [2024-11-29 00:14:31] iteration 62/ 500 | consumed samples: 496 | elapsed time per iteration (ms): 425191.4 | throughput per GPU (TFLOP/s/GPU): 122.4 | learning rate: 4.887334E-06 | global batch size: 8 | lm loss: 1.420748E+00 | loss scale: 1.0 | grad norm: 4.017 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-29 00:22:50] iteration 63/ 500 | consumed samples: 504 | elapsed time per iteration (ms): 498935.6 | throughput per GPU (TFLOP/s/GPU): 104.3 | learning rate: 4.882528E-06 | global batch size: 8 | lm loss: 1.325493E+00 | loss scale: 1.0 | grad norm: 2.921 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-11-29 00:30:35] iteration 64/ 500 | consumed samples: 512 | elapsed time per iteration (ms): 465537.3 | throughput per GPU (TFLOP/s/GPU): 111.8 | learning rate: 4.877624E-06 | global batch size: 8 | lm loss: 1.342766E+00 | loss scale: 1.0 | grad norm: 2.684 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516ee2340] mmco: unref short failure processed_samples 3000 unjoint_samples 3000 joint_samples 8 [209876, 1047479] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1038574, 108667] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [467679, 1039187] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1043814, 238270] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [442152, 1030365] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [562463, 1046053] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1032263, 156247] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [579221, 1036659] [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1043814, 238270] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1038574, 108667] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [209876, 1047479] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [1032263, 156247] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [442152, 1030365] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [467679, 1039187] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [562463, 1046053] processed_samples 3000 unjoint_samples 3000 joint_samples 8 [579221, 1036659] [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516b664c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516b664c0] mmco: unref short failure processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1038574, 509673] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [470770, 1047479] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1032263, 503308] [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1043814, 597379] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [894532, 1036659] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [893361, 1046053] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [825304, 1039187] [h264 @ 0x55d514dfd500] mmco: unref short failure processed_samples 3100 unjoint_samples 3100 joint_samples 8 [780811, 1030365] [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1038574, 509673] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [470770, 1047479] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1032263, 503308] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [825304, 1039187] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [1043814, 597379] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [893361, 1046053] [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure processed_samples 3100 unjoint_samples 3100 joint_samples 8 [780811, 1030365] processed_samples 3100 unjoint_samples 3100 joint_samples 8 [894532, 1036659] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-29 00:41:20] iteration 65/ 500 | consumed samples: 520 | elapsed time per iteration (ms): 644551.8 | throughput per GPU (TFLOP/s/GPU): 80.7 | learning rate: 4.872622E-06 | global batch size: 8 | lm loss: 1.464056E+00 | loss scale: 1.0 | grad norm: 3.641 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 00:49:29] iteration 66/ 500 | consumed samples: 528 | elapsed time per iteration (ms): 489095.8 | throughput per GPU (TFLOP/s/GPU): 106.4 | learning rate: 4.867523E-06 | global batch size: 8 | lm loss: 1.302190E+00 | loss scale: 1.0 | grad norm: 3.139 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 00:58:04] iteration 67/ 500 | consumed samples: 536 | elapsed time per iteration (ms): 514950.5 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 4.862327E-06 | global batch size: 8 | lm loss: 1.271642E+00 | loss scale: 1.0 | grad norm: 4.161 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [2024-11-29 01:06:04] iteration 68/ 500 | consumed samples: 544 | elapsed time per iteration (ms): 480233.0 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 4.857033E-06 | global batch size: 8 | lm loss: 1.318385E+00 | loss scale: 1.0 | grad norm: 3.545 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1046643, 153549] processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1038574, 944139] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1043814, 892001] processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1033502, 182363] processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1032263, 770555] processed_samples 3200 unjoint_samples 3200 joint_samples 9 [154998, 1032569] processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1047130, 373515] processed_samples 3200 unjoint_samples 3200 joint_samples 8 [851493, 1047479] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1046643, 153549] processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1038574, 944139] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1043814, 892001] [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1033502, 182363] processed_samples 3200 unjoint_samples 3200 joint_samples 9 [154998, 1032569] processed_samples 3200 unjoint_samples 3200 joint_samples 8 [1032263, 770555] processed_samples 3200 unjoint_samples 3200 joint_samples 9 [1047130, 373515] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 3200 unjoint_samples 3200 joint_samples 8 [851493, 1047479] [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-11-29 01:14:21] iteration 69/ 500 | consumed samples: 552 | elapsed time per iteration (ms): 496961.5 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 4.851643E-06 | global batch size: 8 | lm loss: 1.259558E+00 | loss scale: 1.0 | grad norm: 2.552 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51a2c2780] mmco: unref short failure [h264 @ 0x55d51a2c2780] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [2024-11-29 01:22:13] iteration 70/ 500 | consumed samples: 560 | elapsed time per iteration (ms): 471204.7 | throughput per GPU (TFLOP/s/GPU): 110.4 | learning rate: 4.846156E-06 | global batch size: 8 | lm loss: 1.274312E+00 | loss scale: 1.0 | grad norm: 2.886 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [2024-11-29 01:29:27] iteration 71/ 500 | consumed samples: 568 | elapsed time per iteration (ms): 434341.8 | throughput per GPU (TFLOP/s/GPU): 119.8 | learning rate: 4.840573E-06 | global batch size: 8 | lm loss: 1.292556E+00 | loss scale: 1.0 | grad norm: 4.098 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046506, 84077] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1044719, 190027] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046590, 224059] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [400124, 1032569] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1033502, 425149] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046643, 487770] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1047130, 726736] processed_samples 3300 unjoint_samples 3300 joint_samples 8 [1044563, 1042250] [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046506, 84077] [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1044719, 190027] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046590, 224059] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1033502, 425149] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1046643, 487770] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [400124, 1032569] processed_samples 3300 unjoint_samples 3300 joint_samples 9 [1047130, 726736] processed_samples 3300 unjoint_samples 3300 joint_samples 8 [1044563, 1042250] [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [2024-11-29 01:38:44] iteration 72/ 500 | consumed samples: 576 | elapsed time per iteration (ms): 556913.5 | throughput per GPU (TFLOP/s/GPU): 93.4 | learning rate: 4.834894E-06 | global batch size: 8 | lm loss: 1.166198E+00 | loss scale: 1.0 | grad norm: 2.673 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [2024-11-29 01:47:56] iteration 73/ 500 | consumed samples: 584 | elapsed time per iteration (ms): 552456.3 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 4.829119E-06 | global batch size: 8 | lm loss: 1.214278E+00 | loss scale: 1.0 | grad norm: 2.226 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1033502, 756251] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1033502, 756251] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046590, 456265] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1044719, 601555] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1044719, 601555] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046742, 222638] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046590, 456265] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046506, 428089] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046742, 222638] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046506, 428089] processed_samples 3400 unjoint_samples 3400 joint_samples 10 [655391, 993746] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046643, 969469] [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 3400 unjoint_samples 3400 joint_samples 10 [655391, 993746] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [1046643, 969469] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [679497, 1032569] processed_samples 3400 unjoint_samples 3400 joint_samples 9 [679497, 1032569] [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-11-29 01:56:04] iteration 74/ 500 | consumed samples: 592 | elapsed time per iteration (ms): 487748.1 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 4.823248E-06 | global batch size: 8 | lm loss: 1.274319E+00 | loss scale: 1.0 | grad norm: 3.009 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [2024-11-29 02:04:34] iteration 75/ 500 | consumed samples: 600 | elapsed time per iteration (ms): 509804.4 | throughput per GPU (TFLOP/s/GPU): 102.1 | learning rate: 4.817282E-06 | global batch size: 8 | lm loss: 1.160900E+00 | loss scale: 1.0 | grad norm: 1.811 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1033502, 1044301] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046590, 765565] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1033502, 1044301] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046590, 765565] processed_samples 3500 unjoint_samples 3500 joint_samples 10 [1047244, 10668] processed_samples 3500 unjoint_samples 3500 joint_samples 10 [1047244, 10668] processed_samples 3500 unjoint_samples 3500 joint_samples 10 [223735, 1047330] processed_samples 3500 unjoint_samples 3500 joint_samples 10 [223735, 1047330] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046742, 545020] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046742, 545020] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046506, 752241] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1046506, 752241] [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure processed_samples 3500 unjoint_samples 3500 joint_samples 10 [933465, 993746] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1034621, 1034829] processed_samples 3500 unjoint_samples 3500 joint_samples 10 [933465, 993746] processed_samples 3500 unjoint_samples 3500 joint_samples 9 [1034621, 1034829] [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [2024-11-29 02:14:25] iteration 76/ 500 | consumed samples: 608 | elapsed time per iteration (ms): 591255.1 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 4.811221E-06 | global batch size: 8 | lm loss: 1.213377E+00 | loss scale: 1.0 | grad norm: 1.804 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [2024-11-29 02:21:37] iteration 77/ 500 | consumed samples: 616 | elapsed time per iteration (ms): 431698.0 | throughput per GPU (TFLOP/s/GPU): 120.5 | learning rate: 4.805065E-06 | global batch size: 8 | lm loss: 1.169552E+00 | loss scale: 1.0 | grad norm: 2.096 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [2024-11-29 02:29:05] iteration 78/ 500 | consumed samples: 624 | elapsed time per iteration (ms): 448587.5 | throughput per GPU (TFLOP/s/GPU): 116.0 | learning rate: 4.798814E-06 | global batch size: 8 | lm loss: 1.225845E+00 | loss scale: 1.0 | grad norm: 1.673 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [2024-11-29 02:37:14] iteration 79/ 500 | consumed samples: 632 | elapsed time per iteration (ms): 488178.4 | throughput per GPU (TFLOP/s/GPU): 106.6 | learning rate: 4.792469E-06 | global batch size: 8 | lm loss: 1.162172E+00 | loss scale: 1.0 | grad norm: 1.446 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1046590, 15967] [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1046590, 15967] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure processed_samples 3600 unjoint_samples 3600 joint_samples 10 [299263, 1045384] processed_samples 3600 unjoint_samples 3600 joint_samples 11 [1037619, 181752] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [299263, 1045384] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1044335, 332165] processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046506, 1042136] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1047244, 305151] processed_samples 3600 unjoint_samples 3600 joint_samples 11 [1037619, 181752] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1044335, 332165] processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046742, 769797] processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046742, 769797] processed_samples 3600 unjoint_samples 3600 joint_samples 9 [1046506, 1042136] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [1047244, 305151] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [565551, 1047330] processed_samples 3600 unjoint_samples 3600 joint_samples 10 [565551, 1047330] [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [2024-11-29 02:46:17] iteration 80/ 500 | consumed samples: 640 | elapsed time per iteration (ms): 543840.7 | throughput per GPU (TFLOP/s/GPU): 95.7 | learning rate: 4.786030E-06 | global batch size: 8 | lm loss: 1.119773E+00 | loss scale: 1.0 | grad norm: 1.404 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (245412.40, 245413.07) [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 3700 unjoint_samples 3700 joint_samples 10 [216677, 1044651] [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1046590, 334641] processed_samples 3700 unjoint_samples 3700 joint_samples 9 [1046742, 1013845] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [608115, 1045384] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1044335, 574437] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1047244, 607285] processed_samples 3700 unjoint_samples 3700 joint_samples 11 [1037619, 632873] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1011196, 1047330] [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 3700 unjoint_samples 3700 joint_samples 10 [216677, 1044651] processed_samples 3700 unjoint_samples 3700 joint_samples 9 [1046742, 1013845] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [608115, 1045384] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1046590, 334641] processed_samples 3700 unjoint_samples 3700 joint_samples 11 [1037619, 632873] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1044335, 574437] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1047244, 607285] processed_samples 3700 unjoint_samples 3700 joint_samples 10 [1011196, 1047330] [2024-11-29 03:00:11] iteration 81/ 500 | consumed samples: 648 | elapsed time per iteration (ms): 587900.4 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 4.779497E-06 | global batch size: 8 | lm loss: 1.176208E+00 | loss scale: 1.0 | grad norm: 1.442 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 03:06:57] iteration 82/ 500 | consumed samples: 656 | elapsed time per iteration (ms): 406422.6 | throughput per GPU (TFLOP/s/GPU): 128.0 | learning rate: 4.772870E-06 | global batch size: 8 | lm loss: 1.140343E+00 | loss scale: 1.0 | grad norm: 1.479 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d514af6580] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1047244, 929345] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046790, 390463] processed_samples 3800 unjoint_samples 3800 joint_samples 11 [242157, 1047330] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046590, 621579] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1044335, 832002] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [606617, 1044651] [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 10 [910276, 1045384] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 11 [1037619, 908765] [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1047244, 929345] [h264 @ 0x55d4f080e440] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046790, 390463] processed_samples 3800 unjoint_samples 3800 joint_samples 11 [242157, 1047330] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [606617, 1044651] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1046590, 621579] processed_samples 3800 unjoint_samples 3800 joint_samples 10 [1044335, 832002] [h264 @ 0x55d514409540] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 10 [910276, 1045384] [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 3800 unjoint_samples 3800 joint_samples 11 [1037619, 908765] [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-11-29 03:17:00] iteration 83/ 500 | consumed samples: 664 | elapsed time per iteration (ms): 603008.3 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 4.766150E-06 | global batch size: 8 | lm loss: 1.124429E+00 | loss scale: 1.0 | grad norm: 1.974 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-11-29 03:25:11] iteration 84/ 500 | consumed samples: 672 | elapsed time per iteration (ms): 490715.5 | throughput per GPU (TFLOP/s/GPU): 106.1 | learning rate: 4.759337E-06 | global batch size: 8 | lm loss: 1.055463E+00 | loss scale: 1.0 | grad norm: 1.717 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-11-29 03:33:25] iteration 85/ 500 | consumed samples: 680 | elapsed time per iteration (ms): 494020.4 | throughput per GPU (TFLOP/s/GPU): 105.3 | learning rate: 4.752432E-06 | global batch size: 8 | lm loss: 1.164560E+00 | loss scale: 1.0 | grad norm: 2.110 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 03:40:54] iteration 86/ 500 | consumed samples: 688 | elapsed time per iteration (ms): 448621.9 | throughput per GPU (TFLOP/s/GPU): 116.0 | learning rate: 4.745434E-06 | global batch size: 8 | lm loss: 1.063381E+00 | loss scale: 1.0 | grad norm: 1.811 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 3900 unjoint_samples 3900 joint_samples 11 [244902, 1046490] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [244902, 1046490] processed_samples 3900 unjoint_samples 3900 joint_samples 10 [805089, 1044651] processed_samples 3900 unjoint_samples 3900 joint_samples 10 [805089, 1044651] processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046790, 707809] processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046790, 707809] processed_samples 3900 unjoint_samples 3900 joint_samples 12 [1042702, 181960] processed_samples 3900 unjoint_samples 3900 joint_samples 12 [1042702, 181960] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [1045136, 195057] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [1045136, 195057] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [153527, 1039099] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [153527, 1039099] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [500248, 1047330] processed_samples 3900 unjoint_samples 3900 joint_samples 11 [500248, 1047330] processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046590, 967625] processed_samples 3900 unjoint_samples 3900 joint_samples 10 [1046590, 967625] [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [2024-11-29 03:48:53] iteration 87/ 500 | consumed samples: 696 | elapsed time per iteration (ms): 479164.2 | throughput per GPU (TFLOP/s/GPU): 108.6 | learning rate: 4.738344E-06 | global batch size: 8 | lm loss: 1.037563E+00 | loss scale: 1.0 | grad norm: 1.815 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [2024-11-29 03:57:24] iteration 88/ 500 | consumed samples: 704 | elapsed time per iteration (ms): 510741.3 | throughput per GPU (TFLOP/s/GPU): 101.9 | learning rate: 4.731162E-06 | global batch size: 8 | lm loss: 1.042004E+00 | loss scale: 1.0 | grad norm: 1.369 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1047088, 6440] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1047088, 6440] [2024-11-29 04:06:50] iteration 89/ 500 | consumed samples: 712 | elapsed time per iteration (ms): 566332.4 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 4.723889E-06 | global batch size: 8 | lm loss: 1.054446E+00 | loss scale: 1.0 | grad norm: 1.186 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 4000 unjoint_samples 4000 joint_samples 11 [830368, 1047330] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [830368, 1047330] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [46408, 1037817] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [46408, 1037817] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1045136, 461770] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [563123, 1046490] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1046590, 171890] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1045136, 461770] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [1046590, 171890] processed_samples 4000 unjoint_samples 4000 joint_samples 12 [1042702, 496263] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [563123, 1046490] processed_samples 4000 unjoint_samples 4000 joint_samples 12 [1042702, 496263] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [458049, 1039099] processed_samples 4000 unjoint_samples 4000 joint_samples 11 [458049, 1039099] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-11-29 04:14:27] iteration 90/ 500 | consumed samples: 720 | elapsed time per iteration (ms): 457008.2 | throughput per GPU (TFLOP/s/GPU): 113.9 | learning rate: 4.716524E-06 | global batch size: 8 | lm loss: 1.043577E+00 | loss scale: 1.0 | grad norm: 1.171 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1047088, 285789] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [409121, 1037817] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1046590, 563874] processed_samples 4100 unjoint_samples 4100 joint_samples 12 [993319, 204450] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure processed_samples 4100 unjoint_samples 4100 joint_samples 11 [777555, 1039099] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [824691, 1046490] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1045136, 727958] processed_samples 4100 unjoint_samples 4100 joint_samples 12 [1042702, 905179] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1046590, 563874] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1047088, 285789] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [409121, 1037817] processed_samples 4100 unjoint_samples 4100 joint_samples 12 [993319, 204450] processed_samples 4100 unjoint_samples 4100 joint_samples 12 [1042702, 905179] processed_samples 4100 unjoint_samples 4100 joint_samples 11 [777555, 1039099] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 4100 unjoint_samples 4100 joint_samples 11 [824691, 1046490] [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure processed_samples 4100 unjoint_samples 4100 joint_samples 11 [1045136, 727958] [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [2024-11-29 04:24:04] iteration 91/ 500 | consumed samples: 728 | elapsed time per iteration (ms): 576893.4 | throughput per GPU (TFLOP/s/GPU): 90.2 | learning rate: 4.709068E-06 | global batch size: 8 | lm loss: 9.959705E-01 | loss scale: 1.0 | grad norm: 1.052 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 04:31:49] iteration 92/ 500 | consumed samples: 736 | elapsed time per iteration (ms): 464881.9 | throughput per GPU (TFLOP/s/GPU): 111.9 | learning rate: 4.701522E-06 | global batch size: 8 | lm loss: 1.006524E+00 | loss scale: 1.0 | grad norm: 1.068 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [2024-11-29 04:39:19] iteration 93/ 500 | consumed samples: 744 | elapsed time per iteration (ms): 450545.2 | throughput per GPU (TFLOP/s/GPU): 115.5 | learning rate: 4.693886E-06 | global batch size: 8 | lm loss: 9.836991E-01 | loss scale: 1.0 | grad norm: 1.314 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [2024-11-29 04:48:00] iteration 94/ 500 | consumed samples: 752 | elapsed time per iteration (ms): 520814.7 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 4.686160E-06 | global batch size: 8 | lm loss: 1.038640E+00 | loss scale: 1.0 | grad norm: 1.123 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1047088, 636316] processed_samples 4200 unjoint_samples 4200 joint_samples 13 [1046693, 107270] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [83138, 1010857] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [981983, 307130] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [37300, 1046490] processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1046590, 865156] processed_samples 4200 unjoint_samples 4200 joint_samples 11 [734402, 1037817] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [993319, 475002] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1047088, 636316] processed_samples 4200 unjoint_samples 4200 joint_samples 13 [1046693, 107270] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [83138, 1010857] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [37300, 1046490] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [981983, 307130] processed_samples 4200 unjoint_samples 4200 joint_samples 11 [1046590, 865156] processed_samples 4200 unjoint_samples 4200 joint_samples 11 [734402, 1037817] processed_samples 4200 unjoint_samples 4200 joint_samples 12 [993319, 475002] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [2024-11-29 04:56:25] iteration 95/ 500 | consumed samples: 760 | elapsed time per iteration (ms): 504330.7 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 4.678344E-06 | global batch size: 8 | lm loss: 1.033153E+00 | loss scale: 1.0 | grad norm: 1.444 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1047088, 890480] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [981983, 615910] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [102485, 1046252] processed_samples 4300 unjoint_samples 4300 joint_samples 13 [1046693, 415812] processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1047088, 890480] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [102485, 1046252] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [406857, 1046490] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [412630, 1010857] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [981983, 615910] [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure processed_samples 4300 unjoint_samples 4300 joint_samples 12 [412630, 1010857] processed_samples 4300 unjoint_samples 4300 joint_samples 12 [406857, 1046490] [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure processed_samples 4300 unjoint_samples 4300 joint_samples 12 [993319, 788105] processed_samples 4300 unjoint_samples 4300 joint_samples 13 [1046693, 415812] processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1011278, 1037817] [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 4300 unjoint_samples 4300 joint_samples 11 [1011278, 1037817] [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure processed_samples 4300 unjoint_samples 4300 joint_samples 12 [993319, 788105] [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [2024-11-29 05:05:55] iteration 96/ 500 | consumed samples: 768 | elapsed time per iteration (ms): 570433.4 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 4.670439E-06 | global batch size: 8 | lm loss: 1.019382E+00 | loss scale: 1.0 | grad norm: 1.070 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [2024-11-29 05:14:18] iteration 97/ 500 | consumed samples: 776 | elapsed time per iteration (ms): 502991.6 | throughput per GPU (TFLOP/s/GPU): 103.5 | learning rate: 4.662444E-06 | global batch size: 8 | lm loss: 1.011606E+00 | loss scale: 1.0 | grad norm: 1.330 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51396a480] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-11-29 05:22:22] iteration 98/ 500 | consumed samples: 784 | elapsed time per iteration (ms): 484093.2 | throughput per GPU (TFLOP/s/GPU): 107.5 | learning rate: 4.654361E-06 | global batch size: 8 | lm loss: 1.043122E+00 | loss scale: 1.0 | grad norm: 1.471 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1047088, 109545] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1047088, 109545] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [520419, 1046252] processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1042102, 33067] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [520419, 1046252] processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1042102, 33067] processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1046693, 688866] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [768511, 1046490] processed_samples 4400 unjoint_samples 4400 joint_samples 13 [1046693, 688866] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1041692, 359584] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [1041692, 359584] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [981983, 886177] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [743694, 1010857] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [743694, 1010857] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [768511, 1046490] processed_samples 4400 unjoint_samples 4400 joint_samples 12 [981983, 886177] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51396a480] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [2024-11-29 05:31:38] iteration 99/ 500 | consumed samples: 792 | elapsed time per iteration (ms): 556395.2 | throughput per GPU (TFLOP/s/GPU): 93.5 | learning rate: 4.646190E-06 | global batch size: 8 | lm loss: 9.925530E-01 | loss scale: 1.0 | grad norm: 1.482 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [2024-11-29 05:39:47] iteration 100/ 500 | consumed samples: 800 | elapsed time per iteration (ms): 488373.0 | throughput per GPU (TFLOP/s/GPU): 106.6 | learning rate: 4.637931E-06 | global batch size: 8 | lm loss: 9.455621E-01 | loss scale: 1.0 | grad norm: 0.972 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (248965.82, 248966.16) processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1036673, 41529] processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1036673, 41529] [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1047088, 454637] processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1047088, 454637] processed_samples 4500 unjoint_samples 4500 joint_samples 13 [10701, 1046490] processed_samples 4500 unjoint_samples 4500 joint_samples 13 [10701, 1046490] processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1042102, 270411] processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1042102, 270411] processed_samples 4500 unjoint_samples 4500 joint_samples 12 [834087, 1046252] processed_samples 4500 unjoint_samples 4500 joint_samples 12 [834087, 1046252] processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1041692, 612687] processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1041692, 612687] [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1046693, 1013490] processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1038438, 1038449] [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 4500 unjoint_samples 4500 joint_samples 12 [1038438, 1038449] [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 4500 unjoint_samples 4500 joint_samples 13 [1046693, 1013490] [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [2024-11-29 05:51:18] iteration 101/ 500 | consumed samples: 808 | elapsed time per iteration (ms): 442254.7 | throughput per GPU (TFLOP/s/GPU): 117.7 | learning rate: 4.629585E-06 | global batch size: 8 | lm loss: 1.065639E+00 | loss scale: 1.0 | grad norm: 1.389 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51396a480] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-11-29 05:59:08] iteration 102/ 500 | consumed samples: 816 | elapsed time per iteration (ms): 470143.8 | throughput per GPU (TFLOP/s/GPU): 110.7 | learning rate: 4.621151E-06 | global batch size: 8 | lm loss: 1.034651E+00 | loss scale: 1.0 | grad norm: 0.955 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [2024-11-29 06:06:19] iteration 103/ 500 | consumed samples: 824 | elapsed time per iteration (ms): 431079.3 | throughput per GPU (TFLOP/s/GPU): 120.7 | learning rate: 4.612630E-06 | global batch size: 8 | lm loss: 1.042905E+00 | loss scale: 1.0 | grad norm: 1.657 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1036673, 353785] [h264 @ 0x562163b30a80] mmco: unref short failure processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1047088, 687956] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042766, 104271] processed_samples 4600 unjoint_samples 4600 joint_samples 14 [311681, 1031759] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042102, 534994] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [360930, 1045107] processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1041692, 896915] [h264 @ 0x56215babf100] mmco: unref short failure processed_samples 4600 unjoint_samples 4600 joint_samples 13 [257614, 1046490] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1036673, 353785] [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1047088, 687956] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042766, 104271] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [257614, 1046490] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [360930, 1045107] processed_samples 4600 unjoint_samples 4600 joint_samples 14 [311681, 1031759] processed_samples 4600 unjoint_samples 4600 joint_samples 13 [1042102, 534994] [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 4600 unjoint_samples 4600 joint_samples 12 [1041692, 896915] [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5135e2680] mmco: unref short failure [h264 @ 0x55d5135e2680] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [2024-11-29 06:15:23] iteration 104/ 500 | consumed samples: 832 | elapsed time per iteration (ms): 543203.7 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 4.604022E-06 | global batch size: 8 | lm loss: 1.031887E+00 | loss scale: 1.0 | grad norm: 0.929 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure processed_samples 4700 unjoint_samples 4700 joint_samples 13 [62227, 1003115] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [62227, 1003115] [2024-11-29 06:25:18] iteration 105/ 500 | consumed samples: 840 | elapsed time per iteration (ms): 595132.4 | throughput per GPU (TFLOP/s/GPU): 87.4 | learning rate: 4.595329E-06 | global batch size: 8 | lm loss: 1.009381E+00 | loss scale: 1.0 | grad norm: 1.249 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 4700 unjoint_samples 4700 joint_samples 13 [157102, 1036574] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [157102, 1036574] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042766, 381816] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [515158, 1046490] processed_samples 4700 unjoint_samples 4700 joint_samples 14 [612607, 1031759] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042102, 837593] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042102, 837593] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1042766, 381816] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1036673, 732799] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [576384, 1045107] processed_samples 4700 unjoint_samples 4700 joint_samples 14 [612607, 1031759] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [1036673, 732799] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [576384, 1045107] processed_samples 4700 unjoint_samples 4700 joint_samples 13 [515158, 1046490] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [2024-11-29 06:34:43] iteration 106/ 500 | consumed samples: 848 | elapsed time per iteration (ms): 565474.5 | throughput per GPU (TFLOP/s/GPU): 92.0 | learning rate: 4.586549E-06 | global batch size: 8 | lm loss: 1.035590E+00 | loss scale: 1.0 | grad norm: 1.114 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure processed_samples 4800 unjoint_samples 4800 joint_samples 13 [842164, 1046490] processed_samples 4800 unjoint_samples 4800 joint_samples 14 [126876, 1033289] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [374110, 1036574] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1036673, 997318] processed_samples 4800 unjoint_samples 4800 joint_samples 14 [947205, 1031759] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [409213, 1003115] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1042766, 729396] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [915955, 1045107] [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure processed_samples 4800 unjoint_samples 4800 joint_samples 13 [409213, 1003115] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [374110, 1036574] processed_samples 4800 unjoint_samples 4800 joint_samples 14 [126876, 1033289] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1042766, 729396] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [1036673, 997318] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [842164, 1046490] processed_samples 4800 unjoint_samples 4800 joint_samples 14 [947205, 1031759] processed_samples 4800 unjoint_samples 4800 joint_samples 13 [915955, 1045107] [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [2024-11-29 06:42:00] iteration 107/ 500 | consumed samples: 856 | elapsed time per iteration (ms): 436982.9 | throughput per GPU (TFLOP/s/GPU): 119.1 | learning rate: 4.577684E-06 | global batch size: 8 | lm loss: 9.436591E-01 | loss scale: 1.0 | grad norm: 0.816 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [2024-11-29 06:51:01] iteration 108/ 500 | consumed samples: 864 | elapsed time per iteration (ms): 540830.1 | throughput per GPU (TFLOP/s/GPU): 96.2 | learning rate: 4.568735E-06 | global batch size: 8 | lm loss: 9.739519E-01 | loss scale: 1.0 | grad norm: 0.957 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [2024-11-29 06:58:44] iteration 109/ 500 | consumed samples: 872 | elapsed time per iteration (ms): 462580.7 | throughput per GPU (TFLOP/s/GPU): 112.5 | learning rate: 4.559700E-06 | global batch size: 8 | lm loss: 1.027283E+00 | loss scale: 1.0 | grad norm: 0.968 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-29 07:06:40] iteration 110/ 500 | consumed samples: 880 | elapsed time per iteration (ms): 476241.8 | throughput per GPU (TFLOP/s/GPU): 109.3 | learning rate: 4.550581E-06 | global batch size: 8 | lm loss: 9.752579E-01 | loss scale: 1.0 | grad norm: 1.212 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 4900 unjoint_samples 4900 joint_samples 13 [682640, 1003115] [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 4900 unjoint_samples 4900 joint_samples 13 [613951, 1036574] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [59990, 1047184] processed_samples 4900 unjoint_samples 4900 joint_samples 15 [187164, 1039417] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [387453, 1033289] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [1036673, 308324] [h264 @ 0x56215bb86100] mmco: unref short failure processed_samples 4900 unjoint_samples 4900 joint_samples 14 [258805, 1047319] processed_samples 4900 unjoint_samples 4900 joint_samples 13 [1042766, 1022708] [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure processed_samples 4900 unjoint_samples 4900 joint_samples 13 [682640, 1003115] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [59990, 1047184] processed_samples 4900 unjoint_samples 4900 joint_samples 15 [187164, 1039417] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [1036673, 308324] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [387453, 1033289] processed_samples 4900 unjoint_samples 4900 joint_samples 13 [613951, 1036574] processed_samples 4900 unjoint_samples 4900 joint_samples 13 [1042766, 1022708] processed_samples 4900 unjoint_samples 4900 joint_samples 14 [258805, 1047319] [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [2024-11-29 07:13:43] iteration 111/ 500 | consumed samples: 888 | elapsed time per iteration (ms): 423204.8 | throughput per GPU (TFLOP/s/GPU): 123.0 | learning rate: 4.541378E-06 | global batch size: 8 | lm loss: 9.781538E-01 | loss scale: 1.0 | grad norm: 1.067 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 5000 unjoint_samples 5000 joint_samples 14 [242265, 1045059] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [430574, 1047184] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [1036673, 661373] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [567324, 1047319] [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 5000 unjoint_samples 5000 joint_samples 15 [526163, 1039417] processed_samples 5000 unjoint_samples 5000 joint_samples 13 [993636, 1003115] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [639696, 1033289] processed_samples 5000 unjoint_samples 5000 joint_samples 13 [980516, 1036574] [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 5000 unjoint_samples 5000 joint_samples 14 [242265, 1045059] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [430574, 1047184] processed_samples 5000 unjoint_samples 5000 joint_samples 15 [526163, 1039417] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [1036673, 661373] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [639696, 1033289] processed_samples 5000 unjoint_samples 5000 joint_samples 14 [567324, 1047319] [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 5000 unjoint_samples 5000 joint_samples 13 [993636, 1003115] processed_samples 5000 unjoint_samples 5000 joint_samples 13 [980516, 1036574] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [2024-11-29 07:23:04] iteration 112/ 500 | consumed samples: 896 | elapsed time per iteration (ms): 560984.6 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 4.532092E-06 | global batch size: 8 | lm loss: 9.570374E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [2024-11-29 07:32:45] iteration 113/ 500 | consumed samples: 904 | elapsed time per iteration (ms): 581159.7 | throughput per GPU (TFLOP/s/GPU): 89.5 | learning rate: 4.522722E-06 | global batch size: 8 | lm loss: 9.289886E-01 | loss scale: 1.0 | grad norm: 0.897 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215cb38d40] mmco: unref short failure [h264 @ 0x56215cb38d40] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure processed_samples 5100 unjoint_samples 5100 joint_samples 14 [141801, 1046799] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1007557, 381378] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [626624, 1045059] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [799334, 1047184] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [931628, 1033289] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [141801, 1046799] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1007557, 381378] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [626624, 1045059] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [799334, 1047184] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [931628, 1033289] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [898833, 1047319] [h264 @ 0x55d513a1f200] mmco: unref short failure processed_samples 5100 unjoint_samples 5100 joint_samples 15 [925759, 1039417] processed_samples 5100 unjoint_samples 5100 joint_samples 15 [925759, 1039417] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [898833, 1047319] processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1036673, 1012305] [h264 @ 0x562160944a00] mmco: unref short failure processed_samples 5100 unjoint_samples 5100 joint_samples 14 [1036673, 1012305] [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [2024-11-29 07:41:28] iteration 114/ 500 | consumed samples: 912 | elapsed time per iteration (ms): 522621.9 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 4.513270E-06 | global batch size: 8 | lm loss: 9.133291E-01 | loss scale: 1.0 | grad norm: 0.891 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [2024-11-29 07:49:33] iteration 115/ 500 | consumed samples: 920 | elapsed time per iteration (ms): 485532.1 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 4.503735E-06 | global batch size: 8 | lm loss: 9.276041E-01 | loss scale: 1.0 | grad norm: 0.883 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-11-29 07:59:32] iteration 116/ 500 | consumed samples: 928 | elapsed time per iteration (ms): 598337.5 | throughput per GPU (TFLOP/s/GPU): 87.0 | learning rate: 4.494118E-06 | global batch size: 8 | lm loss: 9.630095E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [2024-11-29 08:08:05] iteration 117/ 500 | consumed samples: 936 | elapsed time per iteration (ms): 513474.3 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 4.484420E-06 | global batch size: 8 | lm loss: 9.400152E-01 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1045834, 131862] processed_samples 5200 unjoint_samples 5200 joint_samples 16 [156295, 1046736] processed_samples 5200 unjoint_samples 5200 joint_samples 14 [628817, 1046799] processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1023091, 282688] processed_samples 5200 unjoint_samples 5200 joint_samples 15 [350115, 1046837] processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1007557, 620477] processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1029032, 1047184] processed_samples 5200 unjoint_samples 5200 joint_samples 14 [984714, 1045059] [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure processed_samples 5200 unjoint_samples 5200 joint_samples 14 [628817, 1046799] processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1007557, 620477] processed_samples 5200 unjoint_samples 5200 joint_samples 16 [156295, 1046736] processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1045834, 131862] processed_samples 5200 unjoint_samples 5200 joint_samples 15 [350115, 1046837] processed_samples 5200 unjoint_samples 5200 joint_samples 15 [1023091, 282688] [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 5200 unjoint_samples 5200 joint_samples 14 [1029032, 1047184] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 5200 unjoint_samples 5200 joint_samples 14 [984714, 1045059] [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [2024-11-29 08:16:09] iteration 118/ 500 | consumed samples: 944 | elapsed time per iteration (ms): 483708.4 | throughput per GPU (TFLOP/s/GPU): 107.6 | learning rate: 4.474640E-06 | global batch size: 8 | lm loss: 9.409871E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [2024-11-29 08:24:50] iteration 119/ 500 | consumed samples: 952 | elapsed time per iteration (ms): 521429.1 | throughput per GPU (TFLOP/s/GPU): 99.8 | learning rate: 4.464780E-06 | global batch size: 8 | lm loss: 9.223815E-01 | loss scale: 1.0 | grad norm: 6.079 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1038128, 261470] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [377930, 1047184] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [699267, 1046837] processed_samples 5300 unjoint_samples 5300 joint_samples 16 [472923, 1046736] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1023091, 546262] processed_samples 5300 unjoint_samples 5300 joint_samples 14 [1007557, 972932] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1045834, 474081] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1038128, 261470] processed_samples 5300 unjoint_samples 5300 joint_samples 16 [472923, 1046736] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [377930, 1047184] processed_samples 5300 unjoint_samples 5300 joint_samples 14 [928214, 1046799] processed_samples 5300 unjoint_samples 5300 joint_samples 14 [928214, 1046799] [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 5300 unjoint_samples 5300 joint_samples 15 [699267, 1046837] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1045834, 474081] processed_samples 5300 unjoint_samples 5300 joint_samples 14 [1007557, 972932] processed_samples 5300 unjoint_samples 5300 joint_samples 15 [1023091, 546262] [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [2024-11-29 08:34:56] iteration 120/ 500 | consumed samples: 960 | elapsed time per iteration (ms): 605678.0 | throughput per GPU (TFLOP/s/GPU): 85.9 | learning rate: 4.454840E-06 | global batch size: 8 | lm loss: 9.326912E-01 | loss scale: 1.0 | grad norm: 1.200 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (259183.47, 259183.93) [2024-11-29 08:47:16] iteration 121/ 500 | consumed samples: 968 | elapsed time per iteration (ms): 480180.6 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 4.444819E-06 | global batch size: 8 | lm loss: 9.741026E-01 | loss scale: 1.0 | grad norm: 0.813 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1046338, 293973] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [119916, 1047023] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1038128, 508328] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1046338, 293973] processed_samples 5400 unjoint_samples 5400 joint_samples 16 [726574, 1046736] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [119916, 1047023] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1038128, 508328] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [703098, 1047184] processed_samples 5400 unjoint_samples 5400 joint_samples 16 [726574, 1046736] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1023091, 800941] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1023091, 800941] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [703098, 1047184] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1022332, 1046837] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1022332, 1046837] processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1045834, 755518] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 5400 unjoint_samples 5400 joint_samples 15 [1045834, 755518] [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-29 08:55:41] iteration 122/ 500 | consumed samples: 976 | elapsed time per iteration (ms): 505286.7 | throughput per GPU (TFLOP/s/GPU): 103.0 | learning rate: 4.434719E-06 | global batch size: 8 | lm loss: 9.632880E-01 | loss scale: 1.0 | grad norm: 0.791 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [2024-11-29 09:04:05] iteration 123/ 500 | consumed samples: 984 | elapsed time per iteration (ms): 504057.0 | throughput per GPU (TFLOP/s/GPU): 103.2 | learning rate: 4.424540E-06 | global batch size: 8 | lm loss: 9.055476E-01 | loss scale: 1.0 | grad norm: 1.148 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 5500 unjoint_samples 5500 joint_samples 15 [349602, 1047023] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1046338, 531746] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1038128, 841101] [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 5500 unjoint_samples 5500 joint_samples 16 [31641, 1029588] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1046338, 531746] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [349602, 1047023] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [1038128, 841101] processed_samples 5500 unjoint_samples 5500 joint_samples 16 [31641, 1029588] [2024-11-29 09:14:16] iteration 124/ 500 | consumed samples: 992 | elapsed time per iteration (ms): 611534.8 | throughput per GPU (TFLOP/s/GPU): 85.1 | learning rate: 4.414282E-06 | global batch size: 8 | lm loss: 9.379023E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 5500 unjoint_samples 5500 joint_samples 16 [366600, 1046889] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [952606, 1047184] processed_samples 5500 unjoint_samples 5500 joint_samples 16 [366600, 1046889] processed_samples 5500 unjoint_samples 5500 joint_samples 15 [952606, 1047184] [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 5500 unjoint_samples 5500 joint_samples 17 [977175, 102313] [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 5500 unjoint_samples 5500 joint_samples 17 [977175, 102313] processed_samples 5500 unjoint_samples 5500 joint_samples 16 [1045883, 77171] processed_samples 5500 unjoint_samples 5500 joint_samples 16 [1045883, 77171] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-29 09:21:13] iteration 125/ 500 | consumed samples: 1000 | elapsed time per iteration (ms): 416906.3 | throughput per GPU (TFLOP/s/GPU): 124.8 | learning rate: 4.403946E-06 | global batch size: 8 | lm loss: 9.016672E-01 | loss scale: 1.0 | grad norm: 0.697 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [2024-11-29 09:29:09] iteration 126/ 500 | consumed samples: 1008 | elapsed time per iteration (ms): 475206.1 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 4.393533E-06 | global batch size: 8 | lm loss: 9.279566E-01 | loss scale: 1.0 | grad norm: 0.846 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-11-29 09:37:39] iteration 127/ 500 | consumed samples: 1016 | elapsed time per iteration (ms): 510425.2 | throughput per GPU (TFLOP/s/GPU): 102.0 | learning rate: 4.383042E-06 | global batch size: 8 | lm loss: 9.100517E-01 | loss scale: 1.0 | grad norm: 0.751 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 5600 unjoint_samples 5600 joint_samples 15 [665989, 1047023] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [757814, 1046889] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1043995, 106898] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [313677, 1029588] processed_samples 5600 unjoint_samples 5600 joint_samples 17 [977175, 340283] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [138025, 1047184] processed_samples 5600 unjoint_samples 5600 joint_samples 15 [665989, 1047023] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1045883, 398188] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [757814, 1046889] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1043995, 106898] processed_samples 5600 unjoint_samples 5600 joint_samples 15 [1046338, 898228] processed_samples 5600 unjoint_samples 5600 joint_samples 17 [977175, 340283] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [138025, 1047184] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [313677, 1029588] processed_samples 5600 unjoint_samples 5600 joint_samples 16 [1045883, 398188] processed_samples 5600 unjoint_samples 5600 joint_samples 15 [1046338, 898228] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [2024-11-29 09:47:04] iteration 128/ 500 | consumed samples: 1024 | elapsed time per iteration (ms): 564469.2 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 4.372474E-06 | global batch size: 8 | lm loss: 9.047135E-01 | loss scale: 1.0 | grad norm: 0.990 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure processed_samples 5700 unjoint_samples 5700 joint_samples 16 [438013, 1047184] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1046338, 168892] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1043995, 440108] processed_samples 5700 unjoint_samples 5700 joint_samples 17 [977175, 559449] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1048118, 1046889] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [610348, 1029588] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1045883, 910327] processed_samples 5700 unjoint_samples 5700 joint_samples 15 [991968, 1047023] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 5700 unjoint_samples 5700 joint_samples 16 [438013, 1047184] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1046338, 168892] processed_samples 5700 unjoint_samples 5700 joint_samples 17 [977175, 559449] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1043995, 440108] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [610348, 1029588] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1048118, 1046889] processed_samples 5700 unjoint_samples 5700 joint_samples 15 [991968, 1047023] processed_samples 5700 unjoint_samples 5700 joint_samples 16 [1045883, 910327] [2024-11-29 09:56:27] iteration 129/ 500 | consumed samples: 1032 | elapsed time per iteration (ms): 562972.3 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 4.361829E-06 | global batch size: 8 | lm loss: 8.702211E-01 | loss scale: 1.0 | grad norm: 0.896 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [2024-11-29 10:07:13] iteration 130/ 500 | consumed samples: 1040 | elapsed time per iteration (ms): 646197.5 | throughput per GPU (TFLOP/s/GPU): 80.5 | learning rate: 4.351109E-06 | global batch size: 8 | lm loss: 9.481044E-01 | loss scale: 1.0 | grad norm: 0.885 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [2024-11-29 10:17:23] iteration 131/ 500 | consumed samples: 1048 | elapsed time per iteration (ms): 609849.6 | throughput per GPU (TFLOP/s/GPU): 85.3 | learning rate: 4.340313E-06 | global batch size: 8 | lm loss: 9.409455E-01 | loss scale: 1.0 | grad norm: 0.983 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1043995, 761563] processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1046338, 489222] processed_samples 5800 unjoint_samples 5800 joint_samples 17 [172071, 1039900] processed_samples 5800 unjoint_samples 5800 joint_samples 16 [208456, 1047023] processed_samples 5800 unjoint_samples 5800 joint_samples 16 [704681, 1047184] processed_samples 5800 unjoint_samples 5800 joint_samples 17 [274278, 1047055] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 5800 unjoint_samples 5800 joint_samples 17 [977175, 825885] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 5800 unjoint_samples 5800 joint_samples 16 [882082, 1029588] [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure processed_samples 5800 unjoint_samples 5800 joint_samples 16 [208456, 1047023] processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1043995, 761563] processed_samples 5800 unjoint_samples 5800 joint_samples 16 [1046338, 489222] processed_samples 5800 unjoint_samples 5800 joint_samples 17 [172071, 1039900] processed_samples 5800 unjoint_samples 5800 joint_samples 17 [274278, 1047055] [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure processed_samples 5800 unjoint_samples 5800 joint_samples 16 [704681, 1047184] processed_samples 5800 unjoint_samples 5800 joint_samples 17 [977175, 825885] processed_samples 5800 unjoint_samples 5800 joint_samples 16 [882082, 1029588] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [2024-11-29 10:27:20] iteration 132/ 500 | consumed samples: 1056 | elapsed time per iteration (ms): 597434.6 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.329441E-06 | global batch size: 8 | lm loss: 9.020258E-01 | loss scale: 1.0 | grad norm: 0.744 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [2024-11-29 10:35:23] iteration 133/ 500 | consumed samples: 1064 | elapsed time per iteration (ms): 482648.4 | throughput per GPU (TFLOP/s/GPU): 107.8 | learning rate: 4.318496E-06 | global batch size: 8 | lm loss: 8.788584E-01 | loss scale: 1.0 | grad norm: 0.890 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [2024-11-29 10:43:43] iteration 134/ 500 | consumed samples: 1072 | elapsed time per iteration (ms): 499918.4 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 4.307476E-06 | global batch size: 8 | lm loss: 9.287664E-01 | loss scale: 1.0 | grad norm: 0.724 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [2024-11-29 10:51:57] iteration 135/ 500 | consumed samples: 1080 | elapsed time per iteration (ms): 493979.2 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.296382E-06 | global batch size: 8 | lm loss: 9.228923E-01 | loss scale: 1.0 | grad norm: 0.708 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 5900 unjoint_samples 5900 joint_samples 17 [132904, 1028391] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [1046167, 219972] processed_samples 5900 unjoint_samples 5900 joint_samples 16 [503140, 1047023] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [92033, 1047184] processed_samples 5900 unjoint_samples 5900 joint_samples 16 [1046338, 844199] processed_samples 5900 unjoint_samples 5900 joint_samples 18 [1004769, 103157] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [427988, 1039900] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [541781, 1047055] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [132904, 1028391] processed_samples 5900 unjoint_samples 5900 joint_samples 18 [1004769, 103157] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [92033, 1047184] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [1046167, 219972] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [427988, 1039900] processed_samples 5900 unjoint_samples 5900 joint_samples 16 [503140, 1047023] processed_samples 5900 unjoint_samples 5900 joint_samples 16 [1046338, 844199] processed_samples 5900 unjoint_samples 5900 joint_samples 17 [541781, 1047055] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [2024-11-29 11:00:06] iteration 136/ 500 | consumed samples: 1088 | elapsed time per iteration (ms): 489608.1 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 4.285215E-06 | global batch size: 8 | lm loss: 9.310520E-01 | loss scale: 1.0 | grad norm: 0.922 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure processed_samples 6000 unjoint_samples 6000 joint_samples 17 [369661, 1028391] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046338, 178925] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046167, 558366] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [364407, 1047184] [h264 @ 0x562161968b80] mmco: unref short failure processed_samples 6000 unjoint_samples 6000 joint_samples 18 [1004769, 426598] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [783853, 1039900] processed_samples 6000 unjoint_samples 6000 joint_samples 16 [793450, 1047023] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure processed_samples 6000 unjoint_samples 6000 joint_samples 17 [804873, 1047055] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 6000 unjoint_samples 6000 joint_samples 17 [369661, 1028391] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046338, 178925] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [364407, 1047184] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [1046167, 558366] processed_samples 6000 unjoint_samples 6000 joint_samples 16 [793450, 1047023] processed_samples 6000 unjoint_samples 6000 joint_samples 18 [1004769, 426598] processed_samples 6000 unjoint_samples 6000 joint_samples 17 [783853, 1039900] [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 6000 unjoint_samples 6000 joint_samples 17 [804873, 1047055] [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-11-29 11:10:36] iteration 137/ 500 | consumed samples: 1096 | elapsed time per iteration (ms): 629550.2 | throughput per GPU (TFLOP/s/GPU): 82.7 | learning rate: 4.273975E-06 | global batch size: 8 | lm loss: 8.931444E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [2024-11-29 11:21:13] iteration 138/ 500 | consumed samples: 1104 | elapsed time per iteration (ms): 637652.4 | throughput per GPU (TFLOP/s/GPU): 81.6 | learning rate: 4.262663E-06 | global batch size: 8 | lm loss: 8.654176E-01 | loss scale: 1.0 | grad norm: 0.660 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 6100 unjoint_samples 6100 joint_samples 18 [131317, 1042149] processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1047271, 102335] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046338, 527140] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [679324, 1047184] processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1004769, 794060] processed_samples 6100 unjoint_samples 6100 joint_samples 16 [1043235, 1047023] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [628454, 1028391] processed_samples 6100 unjoint_samples 6100 joint_samples 18 [131317, 1042149] processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1047271, 102335] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [628454, 1028391] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [679324, 1047184] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046167, 832626] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046338, 527140] processed_samples 6100 unjoint_samples 6100 joint_samples 18 [1004769, 794060] processed_samples 6100 unjoint_samples 6100 joint_samples 17 [1046167, 832626] processed_samples 6100 unjoint_samples 6100 joint_samples 16 [1043235, 1047023] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [2024-11-29 11:31:38] iteration 139/ 500 | consumed samples: 1112 | elapsed time per iteration (ms): 624648.0 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 4.251279E-06 | global batch size: 8 | lm loss: 9.249836E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [2024-11-29 11:41:36] iteration 140/ 500 | consumed samples: 1120 | elapsed time per iteration (ms): 597821.7 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.239823E-06 | global batch size: 8 | lm loss: 8.981265E-01 | loss scale: 1.0 | grad norm: 0.776 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (257016.91, 257017.27) [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [2024-11-29 11:52:19] iteration 141/ 500 | consumed samples: 1128 | elapsed time per iteration (ms): 386051.8 | throughput per GPU (TFLOP/s/GPU): 134.8 | learning rate: 4.228297E-06 | global batch size: 8 | lm loss: 8.393649E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 6200 unjoint_samples 6200 joint_samples 17 [320922, 1047023] processed_samples 6200 unjoint_samples 6200 joint_samples 17 [1046338, 759933] processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1046167, 138037] processed_samples 6200 unjoint_samples 6200 joint_samples 19 [1030944, 169552] processed_samples 6200 unjoint_samples 6200 joint_samples 18 [443597, 1042149] processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1047271, 408245] processed_samples 6200 unjoint_samples 6200 joint_samples 17 [883233, 1028391] processed_samples 6200 unjoint_samples 6200 joint_samples 17 [929179, 1047184] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure processed_samples 6200 unjoint_samples 6200 joint_samples 17 [320922, 1047023] processed_samples 6200 unjoint_samples 6200 joint_samples 17 [883233, 1028391] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1046167, 138037] processed_samples 6200 unjoint_samples 6200 joint_samples 19 [1030944, 169552] processed_samples 6200 unjoint_samples 6200 joint_samples 17 [1046338, 759933] processed_samples 6200 unjoint_samples 6200 joint_samples 18 [443597, 1042149] processed_samples 6200 unjoint_samples 6200 joint_samples 18 [1047271, 408245] [h264 @ 0x562160944a00] mmco: unref short failure processed_samples 6200 unjoint_samples 6200 joint_samples 17 [929179, 1047184] [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [2024-11-29 12:00:47] iteration 142/ 500 | consumed samples: 1136 | elapsed time per iteration (ms): 507722.0 | throughput per GPU (TFLOP/s/GPU): 102.5 | learning rate: 4.216700E-06 | global batch size: 8 | lm loss: 8.758271E-01 | loss scale: 1.0 | grad norm: 0.692 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [2024-11-29 12:09:29] iteration 143/ 500 | consumed samples: 1144 | elapsed time per iteration (ms): 522359.7 | throughput per GPU (TFLOP/s/GPU): 99.6 | learning rate: 4.205033E-06 | global batch size: 8 | lm loss: 9.256617E-01 | loss scale: 1.0 | grad norm: 0.845 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [2024-11-29 12:16:51] iteration 144/ 500 | consumed samples: 1152 | elapsed time per iteration (ms): 441969.7 | throughput per GPU (TFLOP/s/GPU): 117.7 | learning rate: 4.193297E-06 | global batch size: 8 | lm loss: 8.913149E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure processed_samples 6300 unjoint_samples 6300 joint_samples 18 [53144, 1038729] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046650, 282275] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046167, 459292] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 6300 unjoint_samples 6300 joint_samples 18 [770343, 1042149] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1042680, 77682] processed_samples 6300 unjoint_samples 6300 joint_samples 19 [1030944, 426577] processed_samples 6300 unjoint_samples 6300 joint_samples 17 [704195, 1047023] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1047271, 714129] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure processed_samples 6300 unjoint_samples 6300 joint_samples 18 [53144, 1038729] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [770343, 1042149] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046167, 459292] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1042680, 77682] processed_samples 6300 unjoint_samples 6300 joint_samples 19 [1030944, 426577] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1046650, 282275] processed_samples 6300 unjoint_samples 6300 joint_samples 18 [1047271, 714129] [h264 @ 0x56215b453c80] mmco: unref short failure processed_samples 6300 unjoint_samples 6300 joint_samples 17 [704195, 1047023] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215b5c1b00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1042680, 296069] [h264 @ 0x56215bb50880] mmco: unref short failure processed_samples 6400 unjoint_samples 6400 joint_samples 18 [400344, 1038729] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046167, 731020] processed_samples 6400 unjoint_samples 6400 joint_samples 19 [1030944, 707620] processed_samples 6400 unjoint_samples 6400 joint_samples 17 [1044682, 1047023] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1041317, 1042149] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1047271, 1022858] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046650, 600778] [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1042680, 296069] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [400344, 1038729] [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1041317, 1042149] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046167, 731020] processed_samples 6400 unjoint_samples 6400 joint_samples 19 [1030944, 707620] processed_samples 6400 unjoint_samples 6400 joint_samples 17 [1044682, 1047023] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1046650, 600778] processed_samples 6400 unjoint_samples 6400 joint_samples 18 [1047271, 1022858] [2024-11-29 12:28:10] iteration 145/ 500 | consumed samples: 1160 | elapsed time per iteration (ms): 678831.2 | throughput per GPU (TFLOP/s/GPU): 76.7 | learning rate: 4.181492E-06 | global batch size: 8 | lm loss: 8.918962E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-11-29 12:39:25] iteration 146/ 500 | consumed samples: 1168 | elapsed time per iteration (ms): 675348.5 | throughput per GPU (TFLOP/s/GPU): 77.1 | learning rate: 4.169619E-06 | global batch size: 8 | lm loss: 8.583605E-01 | loss scale: 1.0 | grad norm: 0.592 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [2024-11-29 12:49:16] iteration 147/ 500 | consumed samples: 1176 | elapsed time per iteration (ms): 590427.3 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 4.157677E-06 | global batch size: 8 | lm loss: 8.400425E-01 | loss scale: 1.0 | grad norm: 0.892 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 6500 unjoint_samples 6500 joint_samples 18 [845034, 1038729] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [280065, 1047023] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1042680, 632969] processed_samples 6500 unjoint_samples 6500 joint_samples 19 [343590, 1048229] processed_samples 6500 unjoint_samples 6500 joint_samples 19 [252091, 1037762] processed_samples 6500 unjoint_samples 6500 joint_samples 19 [1030944, 1019548] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046650, 946021] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046167, 998844] [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 6500 unjoint_samples 6500 joint_samples 18 [280065, 1047023] processed_samples 6500 unjoint_samples 6500 joint_samples 19 [252091, 1037762] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1042680, 632969] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [845034, 1038729] processed_samples 6500 unjoint_samples 6500 joint_samples 19 [343590, 1048229] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046650, 946021] processed_samples 6500 unjoint_samples 6500 joint_samples 19 [1030944, 1019548] processed_samples 6500 unjoint_samples 6500 joint_samples 18 [1046167, 998844] [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [2024-11-29 12:59:13] iteration 148/ 500 | consumed samples: 1184 | elapsed time per iteration (ms): 597451.7 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 4.145668E-06 | global batch size: 8 | lm loss: 8.973814E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 13:07:27] iteration 149/ 500 | consumed samples: 1192 | elapsed time per iteration (ms): 493708.6 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 4.133592E-06 | global batch size: 8 | lm loss: 9.514019E-01 | loss scale: 1.0 | grad norm: 0.727 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [2024-11-29 13:16:29] iteration 150/ 500 | consumed samples: 1200 | elapsed time per iteration (ms): 541637.5 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 4.121450E-06 | global batch size: 8 | lm loss: 9.203215E-01 | loss scale: 1.0 | grad norm: 0.633 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-29 13:23:20] iteration 151/ 500 | consumed samples: 1208 | elapsed time per iteration (ms): 411185.1 | throughput per GPU (TFLOP/s/GPU): 126.6 | learning rate: 4.109242E-06 | global batch size: 8 | lm loss: 8.703743E-01 | loss scale: 1.0 | grad norm: 0.635 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-11-29 13:31:53] iteration 152/ 500 | consumed samples: 1216 | elapsed time per iteration (ms): 513393.7 | throughput per GPU (TFLOP/s/GPU): 101.4 | learning rate: 4.096968E-06 | global batch size: 8 | lm loss: 9.042392E-01 | loss scale: 1.0 | grad norm: 0.720 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1027384, 113895] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1027384, 113895] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [37886, 1044635] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [37886, 1044635] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1046650, 145477] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [1046650, 145477] processed_samples 6600 unjoint_samples 6600 joint_samples 18 [668607, 1047023] processed_samples 6600 unjoint_samples 6600 joint_samples 18 [668607, 1047023] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [321006, 1030705] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [321006, 1030705] processed_samples 6600 unjoint_samples 6600 joint_samples 20 [1031402, 448483] processed_samples 6600 unjoint_samples 6600 joint_samples 20 [1031402, 448483] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [549560, 1037762] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [549560, 1037762] [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure processed_samples 6600 unjoint_samples 6600 joint_samples 19 [673000, 1048229] processed_samples 6600 unjoint_samples 6600 joint_samples 19 [673000, 1048229] [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [2024-11-29 13:41:29] iteration 153/ 500 | consumed samples: 1224 | elapsed time per iteration (ms): 575874.7 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 4.084630E-06 | global batch size: 8 | lm loss: 9.082218E-01 | loss scale: 1.0 | grad norm: 0.646 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 6700 unjoint_samples 6700 joint_samples 19 [337863, 1044635] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1027384, 466710] processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1046189, 10876] processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1046189, 10876] [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 6700 unjoint_samples 6700 joint_samples 18 [970558, 1047023] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [337863, 1044635] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1046650, 542888] processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1031402, 787895] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1027384, 466710] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [568568, 1030705] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [1046650, 542888] processed_samples 6700 unjoint_samples 6700 joint_samples 20 [1031402, 787895] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [568568, 1030705] processed_samples 6700 unjoint_samples 6700 joint_samples 18 [970558, 1047023] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [826032, 1037762] processed_samples 6700 unjoint_samples 6700 joint_samples 19 [826032, 1037762] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [2024-11-29 13:51:03] iteration 154/ 500 | consumed samples: 1232 | elapsed time per iteration (ms): 573469.6 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 4.072227E-06 | global batch size: 8 | lm loss: 8.581865E-01 | loss scale: 1.0 | grad norm: 0.790 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [2024-11-29 14:00:09] iteration 155/ 500 | consumed samples: 1240 | elapsed time per iteration (ms): 546612.5 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 4.059760E-06 | global batch size: 8 | lm loss: 9.444237E-01 | loss scale: 1.0 | grad norm: 0.818 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [2024-11-29 14:09:55] iteration 156/ 500 | consumed samples: 1248 | elapsed time per iteration (ms): 585879.6 | throughput per GPU (TFLOP/s/GPU): 88.8 | learning rate: 4.047230E-06 | global batch size: 8 | lm loss: 9.171274E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure processed_samples 6800 unjoint_samples 6800 joint_samples 19 [271761, 1047023] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [271761, 1047023] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1027384, 781072] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1027384, 781072] processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1048170, 25513] processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1046189, 348499] processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1048170, 25513] processed_samples 6800 unjoint_samples 6800 joint_samples 20 [1046189, 348499] processed_samples 6800 unjoint_samples 6800 joint_samples 21 [109090, 1016327] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [934686, 1030705] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1046650, 876889] processed_samples 6800 unjoint_samples 6800 joint_samples 21 [109090, 1016327] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [1046650, 876889] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [647843, 1044635] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [647843, 1044635] processed_samples 6800 unjoint_samples 6800 joint_samples 19 [934686, 1030705] [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-11-29 14:17:43] iteration 157/ 500 | consumed samples: 1256 | elapsed time per iteration (ms): 467819.2 | throughput per GPU (TFLOP/s/GPU): 111.2 | learning rate: 4.034637E-06 | global batch size: 8 | lm loss: 8.549634E-01 | loss scale: 1.0 | grad norm: 0.703 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [2024-11-29 14:26:39] iteration 158/ 500 | consumed samples: 1264 | elapsed time per iteration (ms): 535964.1 | throughput per GPU (TFLOP/s/GPU): 97.1 | learning rate: 4.021981E-06 | global batch size: 8 | lm loss: 8.785049E-01 | loss scale: 1.0 | grad norm: 1.639 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [2024-11-29 14:33:25] iteration 159/ 500 | consumed samples: 1272 | elapsed time per iteration (ms): 406479.1 | throughput per GPU (TFLOP/s/GPU): 128.0 | learning rate: 4.009264E-06 | global batch size: 8 | lm loss: 8.546377E-01 | loss scale: 1.0 | grad norm: 0.659 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 6900 unjoint_samples 6900 joint_samples 20 [12617, 1047814] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [12617, 1047814] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1022264, 234298] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1022264, 234298] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046650, 126573] processed_samples 6900 unjoint_samples 6900 joint_samples 19 [627764, 1047023] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046650, 126573] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1048170, 520650] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1048170, 520650] processed_samples 6900 unjoint_samples 6900 joint_samples 19 [627764, 1047023] processed_samples 6900 unjoint_samples 6900 joint_samples 21 [410801, 1016327] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046189, 859306] processed_samples 6900 unjoint_samples 6900 joint_samples 19 [1042702, 1044635] processed_samples 6900 unjoint_samples 6900 joint_samples 19 [1042702, 1044635] processed_samples 6900 unjoint_samples 6900 joint_samples 21 [410801, 1016327] processed_samples 6900 unjoint_samples 6900 joint_samples 20 [1046189, 859306] [2024-11-29 14:40:53] iteration 160/ 500 | consumed samples: 1280 | elapsed time per iteration (ms): 447637.9 | throughput per GPU (TFLOP/s/GPU): 116.3 | learning rate: 3.996486E-06 | global batch size: 8 | lm loss: 8.583610E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (126718.18, 126718.44) [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [2024-11-29 14:51:07] iteration 161/ 500 | consumed samples: 1288 | elapsed time per iteration (ms): 487365.2 | throughput per GPU (TFLOP/s/GPU): 106.8 | learning rate: 3.983647E-06 | global batch size: 8 | lm loss: 8.265705E-01 | loss scale: 1.0 | grad norm: 1.024 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1022264, 507371] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [226430, 1047106] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [289600, 1047814] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1046650, 487061] [h264 @ 0x56215c37f280] mmco: unref short failure processed_samples 7000 unjoint_samples 7000 joint_samples 21 [93852, 1042906] processed_samples 7000 unjoint_samples 7000 joint_samples 19 [898532, 1047023] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1048170, 811652] [h264 @ 0x56215ba1fac0] mmco: unref short failure processed_samples 7000 unjoint_samples 7000 joint_samples 21 [733359, 1016327] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1022264, 507371] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1046650, 487061] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [226430, 1047106] processed_samples 7000 unjoint_samples 7000 joint_samples 20 [289600, 1047814] processed_samples 7000 unjoint_samples 7000 joint_samples 19 [898532, 1047023] [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 7000 unjoint_samples 7000 joint_samples 20 [1048170, 811652] processed_samples 7000 unjoint_samples 7000 joint_samples 21 [93852, 1042906] processed_samples 7000 unjoint_samples 7000 joint_samples 21 [733359, 1016327] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [2024-11-29 15:04:43] iteration 162/ 500 | consumed samples: 1296 | elapsed time per iteration (ms): 815307.1 | throughput per GPU (TFLOP/s/GPU): 63.8 | learning rate: 3.970748E-06 | global batch size: 8 | lm loss: 8.723611E-01 | loss scale: 1.0 | grad norm: 0.678 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [2024-11-29 15:14:26] iteration 163/ 500 | consumed samples: 1304 | elapsed time per iteration (ms): 583954.2 | throughput per GPU (TFLOP/s/GPU): 89.1 | learning rate: 3.957789E-06 | global batch size: 8 | lm loss: 8.921217E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513966480] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 7100 unjoint_samples 7100 joint_samples 20 [648703, 1047814] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [560809, 1047106] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1046650, 741356] [h264 @ 0x562160944a00] mmco: unref short failure processed_samples 7100 unjoint_samples 7100 joint_samples 21 [88917, 1047729] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1046650, 741356] processed_samples 7100 unjoint_samples 7100 joint_samples 21 [1027960, 1025959] processed_samples 7100 unjoint_samples 7100 joint_samples 21 [88917, 1047729] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [648703, 1047814] processed_samples 7100 unjoint_samples 7100 joint_samples 21 [449883, 1042906] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [560809, 1047106] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [136766, 1047023] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1022264, 887134] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [1022264, 887134] processed_samples 7100 unjoint_samples 7100 joint_samples 20 [136766, 1047023] processed_samples 7100 unjoint_samples 7100 joint_samples 21 [449883, 1042906] [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 7100 unjoint_samples 7100 joint_samples 21 [1027960, 1025959] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [2024-11-29 15:26:40] iteration 164/ 500 | consumed samples: 1312 | elapsed time per iteration (ms): 733709.8 | throughput per GPU (TFLOP/s/GPU): 70.9 | learning rate: 3.944771E-06 | global batch size: 8 | lm loss: 8.858916E-01 | loss scale: 1.0 | grad norm: 0.848 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [2024-11-29 15:34:38] iteration 165/ 500 | consumed samples: 1320 | elapsed time per iteration (ms): 478158.0 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 3.931695E-06 | global batch size: 8 | lm loss: 9.041308E-01 | loss scale: 1.0 | grad norm: 1.039 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure processed_samples 7200 unjoint_samples 7200 joint_samples 21 [117692, 1046949] processed_samples 7200 unjoint_samples 7200 joint_samples 22 [392276, 1047241] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [421087, 1047023] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [862925, 1047106] processed_samples 7200 unjoint_samples 7200 joint_samples 21 [420804, 1047729] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [929831, 1047814] [h264 @ 0x55d51713dd00] mmco: unref short failure processed_samples 7200 unjoint_samples 7200 joint_samples 20 [1046650, 1013162] processed_samples 7200 unjoint_samples 7200 joint_samples 21 [117692, 1046949] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [421087, 1047023] processed_samples 7200 unjoint_samples 7200 joint_samples 22 [392276, 1047241] processed_samples 7200 unjoint_samples 7200 joint_samples 21 [755432, 1042906] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [862925, 1047106] processed_samples 7200 unjoint_samples 7200 joint_samples 21 [420804, 1047729] processed_samples 7200 unjoint_samples 7200 joint_samples 21 [755432, 1042906] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [1046650, 1013162] processed_samples 7200 unjoint_samples 7200 joint_samples 20 [929831, 1047814] [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [2024-11-29 15:43:43] iteration 166/ 500 | consumed samples: 1328 | elapsed time per iteration (ms): 544539.0 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 3.918560E-06 | global batch size: 8 | lm loss: 8.623494E-01 | loss scale: 1.0 | grad norm: 0.699 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [2024-11-29 15:53:27] iteration 167/ 500 | consumed samples: 1336 | elapsed time per iteration (ms): 584550.5 | throughput per GPU (TFLOP/s/GPU): 89.0 | learning rate: 3.905369E-06 | global batch size: 8 | lm loss: 9.145085E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-11-29 16:00:34] iteration 168/ 500 | consumed samples: 1344 | elapsed time per iteration (ms): 426277.2 | throughput per GPU (TFLOP/s/GPU): 122.1 | learning rate: 3.892120E-06 | global batch size: 8 | lm loss: 8.464835E-01 | loss scale: 1.0 | grad norm: 0.658 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [2024-11-29 16:09:00] iteration 169/ 500 | consumed samples: 1352 | elapsed time per iteration (ms): 506237.6 | throughput per GPU (TFLOP/s/GPU): 102.8 | learning rate: 3.878815E-06 | global batch size: 8 | lm loss: 8.497138E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1047065, 216542] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1043644, 160378] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1047065, 216542] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [1043644, 160378] processed_samples 7300 unjoint_samples 7300 joint_samples 20 [753653, 1047023] processed_samples 7300 unjoint_samples 7300 joint_samples 22 [1047180, 11000] processed_samples 7300 unjoint_samples 7300 joint_samples 22 [1047180, 11000] processed_samples 7300 unjoint_samples 7300 joint_samples 20 [753653, 1047023] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [343501, 1046704] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [343501, 1046704] processed_samples 7300 unjoint_samples 7300 joint_samples 22 [687319, 1047241] processed_samples 7300 unjoint_samples 7300 joint_samples 22 [687319, 1047241] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [532855, 1046949] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [532855, 1046949] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [704456, 1047729] processed_samples 7300 unjoint_samples 7300 joint_samples 21 [704456, 1047729] [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1026663, 31663] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1026663, 31663] [2024-11-29 16:18:57] iteration 170/ 500 | consumed samples: 1360 | elapsed time per iteration (ms): 597046.4 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 3.865454E-06 | global batch size: 8 | lm loss: 9.044704E-01 | loss scale: 1.0 | grad norm: 1.100 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 7400 unjoint_samples 7400 joint_samples 21 [967174, 1047729] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [967174, 1047729] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1047065, 497602] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1047065, 497602] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1043644, 528225] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [776663, 1046949] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [557112, 1046704] processed_samples 7400 unjoint_samples 7400 joint_samples 22 [965373, 1047241] processed_samples 7400 unjoint_samples 7400 joint_samples 22 [1047180, 332698] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [1043644, 528225] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [776663, 1046949] processed_samples 7400 unjoint_samples 7400 joint_samples 21 [557112, 1046704] processed_samples 7400 unjoint_samples 7400 joint_samples 22 [1047180, 332698] processed_samples 7400 unjoint_samples 7400 joint_samples 22 [965373, 1047241] [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [2024-11-29 16:29:47] iteration 171/ 500 | consumed samples: 1368 | elapsed time per iteration (ms): 650220.2 | throughput per GPU (TFLOP/s/GPU): 80.0 | learning rate: 3.852039E-06 | global batch size: 8 | lm loss: 8.603762E-01 | loss scale: 1.0 | grad norm: 0.745 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [2024-11-29 16:40:14] iteration 172/ 500 | consumed samples: 1376 | elapsed time per iteration (ms): 627029.4 | throughput per GPU (TFLOP/s/GPU): 83.0 | learning rate: 3.838568E-06 | global batch size: 8 | lm loss: 8.916647E-01 | loss scale: 1.0 | grad norm: 0.893 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1026663, 273522] processed_samples 7500 unjoint_samples 7500 joint_samples 23 [229957, 1047241] processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1047065, 862282] processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1041766, 199325] processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1043644, 821705] processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1047180, 651690] [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure processed_samples 7500 unjoint_samples 7500 joint_samples 21 [847600, 1046704] processed_samples 7500 unjoint_samples 7500 joint_samples 23 [229957, 1047241] processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1037382, 1046949] processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1041766, 199325] processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1026663, 273522] [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1047065, 862282] processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1043644, 821705] processed_samples 7500 unjoint_samples 7500 joint_samples 22 [1047180, 651690] [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure processed_samples 7500 unjoint_samples 7500 joint_samples 21 [1037382, 1046949] processed_samples 7500 unjoint_samples 7500 joint_samples 21 [847600, 1046704] [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [2024-11-29 16:48:54] iteration 173/ 500 | consumed samples: 1384 | elapsed time per iteration (ms): 519428.4 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 3.825043E-06 | global batch size: 8 | lm loss: 8.812805E-01 | loss scale: 1.0 | grad norm: 0.881 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [2024-11-29 16:58:28] iteration 174/ 500 | consumed samples: 1392 | elapsed time per iteration (ms): 573912.8 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 3.811465E-06 | global batch size: 8 | lm loss: 8.597746E-01 | loss scale: 1.0 | grad norm: 0.601 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [2024-11-29 17:07:40] iteration 175/ 500 | consumed samples: 1400 | elapsed time per iteration (ms): 552315.1 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 3.797834E-06 | global batch size: 8 | lm loss: 9.125093E-01 | loss scale: 1.0 | grad norm: 0.722 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [2024-11-29 17:15:39] iteration 176/ 500 | consumed samples: 1408 | elapsed time per iteration (ms): 479317.1 | throughput per GPU (TFLOP/s/GPU): 108.6 | learning rate: 3.784151E-06 | global batch size: 8 | lm loss: 8.529758E-01 | loss scale: 1.0 | grad norm: 1.067 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x562161d4c640] mmco: unref short failure [h264 @ 0x562161d4c640] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure processed_samples 7600 unjoint_samples 7600 joint_samples 22 [190869, 997682] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1036661, 193993] processed_samples 7600 unjoint_samples 7600 joint_samples 23 [492675, 1047241] processed_samples 7600 unjoint_samples 7600 joint_samples 21 [1026663, 545583] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [403092, 1046949] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1041766, 541558] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1047180, 953300] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1045878, 245108] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [190869, 997682] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1036661, 193993] processed_samples 7600 unjoint_samples 7600 joint_samples 23 [492675, 1047241] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1041766, 541558] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1047180, 953300] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [403092, 1046949] processed_samples 7600 unjoint_samples 7600 joint_samples 22 [1045878, 245108] processed_samples 7600 unjoint_samples 7600 joint_samples 21 [1026663, 545583] [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [2024-11-29 17:24:31] iteration 177/ 500 | consumed samples: 1416 | elapsed time per iteration (ms): 531521.6 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 3.770416E-06 | global batch size: 8 | lm loss: 9.014975E-01 | loss scale: 1.0 | grad norm: 0.680 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 7700 unjoint_samples 7700 joint_samples 23 [228736, 1044116] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [451395, 997682] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [702166, 1046949] processed_samples 7700 unjoint_samples 7700 joint_samples 23 [791887, 1047241] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1041766, 881148] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1036661, 501386] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1045878, 578684] [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure processed_samples 7700 unjoint_samples 7700 joint_samples 21 [1026663, 814539] [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 7700 unjoint_samples 7700 joint_samples 23 [228736, 1044116] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [702166, 1046949] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [451395, 997682] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1041766, 881148] [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1045878, 578684] processed_samples 7700 unjoint_samples 7700 joint_samples 22 [1036661, 501386] processed_samples 7700 unjoint_samples 7700 joint_samples 23 [791887, 1047241] processed_samples 7700 unjoint_samples 7700 joint_samples 21 [1026663, 814539] [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [2024-11-29 17:35:52] iteration 178/ 500 | consumed samples: 1424 | elapsed time per iteration (ms): 680769.6 | throughput per GPU (TFLOP/s/GPU): 76.4 | learning rate: 3.756630E-06 | global batch size: 8 | lm loss: 7.555795E-01 | loss scale: 1.0 | grad norm: 0.551 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 7800 unjoint_samples 7800 joint_samples 22 [809672, 997682] processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1047475, 90400] processed_samples 7800 unjoint_samples 7800 joint_samples 23 [553619, 1044116] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1036661, 886667] processed_samples 7800 unjoint_samples 7800 joint_samples 21 [1040458, 1040562] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1045878, 857994] processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1046959, 1047241] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [987884, 1046949] [h264 @ 0x55d516907ac0] mmco: unref short failure processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1047475, 90400] processed_samples 7800 unjoint_samples 7800 joint_samples 23 [553619, 1044116] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [809672, 997682] processed_samples 7800 unjoint_samples 7800 joint_samples 21 [1040458, 1040562] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1036661, 886667] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [1045878, 857994] processed_samples 7800 unjoint_samples 7800 joint_samples 22 [987884, 1046949] processed_samples 7800 unjoint_samples 7800 joint_samples 23 [1046959, 1047241] [h264 @ 0x56215ccdcdc0] mmco: unref short failure [2024-11-29 17:45:43] iteration 179/ 500 | consumed samples: 1432 | elapsed time per iteration (ms): 591570.5 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 3.742793E-06 | global batch size: 8 | lm loss: 9.391325E-01 | loss scale: 1.0 | grad norm: 0.793 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-11-29 17:57:18] iteration 180/ 500 | consumed samples: 1440 | elapsed time per iteration (ms): 694943.9 | throughput per GPU (TFLOP/s/GPU): 74.9 | learning rate: 3.728906E-06 | global batch size: 8 | lm loss: 8.526995E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (256118.44, 256118.80) [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [2024-11-29 18:10:37] iteration 181/ 500 | consumed samples: 1448 | elapsed time per iteration (ms): 542974.2 | throughput per GPU (TFLOP/s/GPU): 95.8 | learning rate: 3.714969E-06 | global batch size: 8 | lm loss: 9.044622E-01 | loss scale: 1.0 | grad norm: 0.630 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [2024-11-29 18:18:57] iteration 182/ 500 | consumed samples: 1456 | elapsed time per iteration (ms): 499317.5 | throughput per GPU (TFLOP/s/GPU): 104.2 | learning rate: 3.700984E-06 | global batch size: 8 | lm loss: 8.581277E-01 | loss scale: 1.0 | grad norm: 0.868 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 18:28:16] iteration 183/ 500 | consumed samples: 1464 | elapsed time per iteration (ms): 559222.9 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 3.686950E-06 | global batch size: 8 | lm loss: 8.661172E-01 | loss scale: 1.0 | grad norm: 0.658 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [2024-11-29 18:37:05] iteration 184/ 500 | consumed samples: 1472 | elapsed time per iteration (ms): 529442.2 | throughput per GPU (TFLOP/s/GPU): 98.3 | learning rate: 3.672869E-06 | global batch size: 8 | lm loss: 8.653899E-01 | loss scale: 1.0 | grad norm: 0.660 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 7900 unjoint_samples 7900 joint_samples 22 [1047361, 270413] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [153090, 1038342] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [173152, 1040042] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [270168, 1046949] processed_samples 7900 unjoint_samples 7900 joint_samples 24 [1046959, 305001] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [1047475, 338689] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [180848, 1038095] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 7900 unjoint_samples 7900 joint_samples 23 [912201, 1044116] processed_samples 7900 unjoint_samples 7900 joint_samples 22 [1047361, 270413] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [153090, 1038342] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [173152, 1040042] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [270168, 1046949] processed_samples 7900 unjoint_samples 7900 joint_samples 24 [1046959, 305001] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [1047475, 338689] processed_samples 7900 unjoint_samples 7900 joint_samples 23 [180848, 1038095] [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 7900 unjoint_samples 7900 joint_samples 23 [912201, 1044116] [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [2024-11-29 18:45:59] iteration 185/ 500 | consumed samples: 1480 | elapsed time per iteration (ms): 533290.0 | throughput per GPU (TFLOP/s/GPU): 97.6 | learning rate: 3.658740E-06 | global batch size: 8 | lm loss: 8.636235E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure processed_samples 8000 unjoint_samples 8000 joint_samples 23 [524852, 1046949] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [542277, 1038095] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [420972, 1038342] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [451751, 1040042] processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1039352, 179325] processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1046959, 604582] processed_samples 8000 unjoint_samples 8000 joint_samples 22 [1047361, 557644] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [1047475, 823289] [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 8000 unjoint_samples 8000 joint_samples 23 [524852, 1046949] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [542277, 1038095] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [420972, 1038342] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [451751, 1040042] processed_samples 8000 unjoint_samples 8000 joint_samples 22 [1047361, 557644] [h264 @ 0x55d51442bac0] mmco: unref short failure processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1039352, 179325] processed_samples 8000 unjoint_samples 8000 joint_samples 24 [1046959, 604582] processed_samples 8000 unjoint_samples 8000 joint_samples 23 [1047475, 823289] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x55d51442bac0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [2024-11-29 18:56:18] iteration 186/ 500 | consumed samples: 1488 | elapsed time per iteration (ms): 619369.7 | throughput per GPU (TFLOP/s/GPU): 84.0 | learning rate: 3.644565E-06 | global batch size: 8 | lm loss: 8.731977E-01 | loss scale: 1.0 | grad norm: 0.936 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 8100 unjoint_samples 8100 joint_samples 23 [682605, 1038342] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [836071, 1046949] processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1047475, 92479] processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1046959, 885564] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [841637, 1038095] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [826311, 1040042] processed_samples 8100 unjoint_samples 8100 joint_samples 22 [1047361, 901929] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1039352, 435904] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [841637, 1038095] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [682605, 1038342] processed_samples 8100 unjoint_samples 8100 joint_samples 22 [1047361, 901929] [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1047475, 92479] processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1039352, 435904] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [826311, 1040042] processed_samples 8100 unjoint_samples 8100 joint_samples 24 [1046959, 885564] processed_samples 8100 unjoint_samples 8100 joint_samples 23 [836071, 1046949] [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [2024-11-29 19:06:07] iteration 187/ 500 | consumed samples: 1496 | elapsed time per iteration (ms): 589183.6 | throughput per GPU (TFLOP/s/GPU): 88.3 | learning rate: 3.630344E-06 | global batch size: 8 | lm loss: 8.653762E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [2024-11-29 19:15:54] iteration 188/ 500 | consumed samples: 1504 | elapsed time per iteration (ms): 586733.4 | throughput per GPU (TFLOP/s/GPU): 88.7 | learning rate: 3.616078E-06 | global batch size: 8 | lm loss: 8.779374E-01 | loss scale: 1.0 | grad norm: 0.729 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 19:24:21] iteration 189/ 500 | consumed samples: 1512 | elapsed time per iteration (ms): 506984.3 | throughput per GPU (TFLOP/s/GPU): 102.6 | learning rate: 3.601767E-06 | global batch size: 8 | lm loss: 8.729650E-01 | loss scale: 1.0 | grad norm: 0.679 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [2024-11-29 19:33:59] iteration 190/ 500 | consumed samples: 1520 | elapsed time per iteration (ms): 577936.4 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 3.587412E-06 | global batch size: 8 | lm loss: 8.692999E-01 | loss scale: 1.0 | grad norm: 0.624 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [2024-11-29 19:43:30] iteration 191/ 500 | consumed samples: 1528 | elapsed time per iteration (ms): 571544.2 | throughput per GPU (TFLOP/s/GPU): 91.1 | learning rate: 3.573013E-06 | global batch size: 8 | lm loss: 8.751049E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure processed_samples 8200 unjoint_samples 8200 joint_samples 24 [38114, 1046907] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1046784, 112401] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1046784, 112401] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [38114, 1046907] processed_samples 8200 unjoint_samples 8200 joint_samples 23 [127718, 1036268] processed_samples 8200 unjoint_samples 8200 joint_samples 25 [88897, 1046533] processed_samples 8200 unjoint_samples 8200 joint_samples 25 [88897, 1046533] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1047475, 479969] processed_samples 8200 unjoint_samples 8200 joint_samples 23 [127718, 1036268] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [166601, 1046949] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [166601, 1046949] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1047475, 479969] processed_samples 8200 unjoint_samples 8200 joint_samples 23 [978353, 1038342] processed_samples 8200 unjoint_samples 8200 joint_samples 23 [978353, 1038342] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1039352, 658160] processed_samples 8200 unjoint_samples 8200 joint_samples 24 [1039352, 658160] [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [2024-11-29 19:52:35] iteration 192/ 500 | consumed samples: 1536 | elapsed time per iteration (ms): 544754.7 | throughput per GPU (TFLOP/s/GPU): 95.5 | learning rate: 3.558572E-06 | global batch size: 8 | lm loss: 8.747140E-01 | loss scale: 1.0 | grad norm: 0.574 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 8300 unjoint_samples 8300 joint_samples 25 [541212, 1046533] [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 8300 unjoint_samples 8300 joint_samples 24 [232124, 1045285] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1046784, 502722] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [471537, 1046949] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [444120, 1046907] processed_samples 8300 unjoint_samples 8300 joint_samples 23 [530384, 1036268] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1047475, 823090] [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1039352, 993587] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 8300 unjoint_samples 8300 joint_samples 24 [232124, 1045285] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [444120, 1046907] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1046784, 502722] [h264 @ 0x55d5141fd280] mmco: unref short failure processed_samples 8300 unjoint_samples 8300 joint_samples 23 [530384, 1036268] processed_samples 8300 unjoint_samples 8300 joint_samples 25 [541212, 1046533] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [471537, 1046949] processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1047475, 823090] [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure processed_samples 8300 unjoint_samples 8300 joint_samples 24 [1039352, 993587] [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [2024-11-29 20:02:10] iteration 193/ 500 | consumed samples: 1544 | elapsed time per iteration (ms): 574455.4 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 3.544088E-06 | global batch size: 8 | lm loss: 8.469065E-01 | loss scale: 1.0 | grad norm: 0.945 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [2024-11-29 20:09:59] iteration 194/ 500 | consumed samples: 1552 | elapsed time per iteration (ms): 469151.3 | throughput per GPU (TFLOP/s/GPU): 110.9 | learning rate: 3.529562E-06 | global batch size: 8 | lm loss: 9.101482E-01 | loss scale: 1.0 | grad norm: 0.728 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b6ccc40] mmco: unref short failure [h264 @ 0x56215b6ccc40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 8400 unjoint_samples 8400 joint_samples 24 [671898, 1045285] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [1046784, 764002] processed_samples 8400 unjoint_samples 8400 joint_samples 25 [269407, 1043464] processed_samples 8400 unjoint_samples 8400 joint_samples 25 [1047475, 100697] [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure processed_samples 8400 unjoint_samples 8400 joint_samples 25 [876980, 1046533] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [866861, 1046949] processed_samples 8400 unjoint_samples 8400 joint_samples 23 [1015102, 1036268] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [862943, 1046907] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure processed_samples 8400 unjoint_samples 8400 joint_samples 25 [269407, 1043464] processed_samples 8400 unjoint_samples 8400 joint_samples 25 [1047475, 100697] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [671898, 1045285] processed_samples 8400 unjoint_samples 8400 joint_samples 25 [876980, 1046533] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [1046784, 764002] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [862943, 1046907] processed_samples 8400 unjoint_samples 8400 joint_samples 24 [866861, 1046949] processed_samples 8400 unjoint_samples 8400 joint_samples 23 [1015102, 1036268] [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [2024-11-29 20:21:04] iteration 195/ 500 | consumed samples: 1560 | elapsed time per iteration (ms): 664879.8 | throughput per GPU (TFLOP/s/GPU): 78.3 | learning rate: 3.514996E-06 | global batch size: 8 | lm loss: 8.420240E-01 | loss scale: 1.0 | grad norm: 0.561 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [2024-11-29 20:30:16] iteration 196/ 500 | consumed samples: 1568 | elapsed time per iteration (ms): 552301.5 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 3.500388E-06 | global batch size: 8 | lm loss: 8.576379E-01 | loss scale: 1.0 | grad norm: 0.711 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [2024-11-29 20:37:35] iteration 197/ 500 | consumed samples: 1576 | elapsed time per iteration (ms): 439104.5 | throughput per GPU (TFLOP/s/GPU): 118.5 | learning rate: 3.485741E-06 | global batch size: 8 | lm loss: 8.491023E-01 | loss scale: 1.0 | grad norm: 0.739 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d512dcb440] Missing reference picture, default is 65530 [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215ec3c040] Missing reference picture, default is 65530 [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [2024-11-29 20:51:16] iteration 198/ 500 | consumed samples: 1584 | elapsed time per iteration (ms): 821217.4 | throughput per GPU (TFLOP/s/GPU): 63.4 | learning rate: 3.471055E-06 | global batch size: 8 | lm loss: 8.837790E-01 | loss scale: 1.0 | grad norm: 1.253 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1040042, 256314] processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1040042, 256314] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1047475, 407578] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1047475, 407578] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [122087, 1046907] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [122087, 1046907] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1032949, 217199] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [1032949, 217199] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [606117, 1043464] processed_samples 8500 unjoint_samples 8500 joint_samples 26 [100937, 1046533] processed_samples 8500 unjoint_samples 8500 joint_samples 26 [100937, 1046533] processed_samples 8500 unjoint_samples 8500 joint_samples 25 [606117, 1043464] processed_samples 8500 unjoint_samples 8500 joint_samples 24 [973145, 1045285] processed_samples 8500 unjoint_samples 8500 joint_samples 24 [973145, 1045285] processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1046784, 1022117] processed_samples 8500 unjoint_samples 8500 joint_samples 24 [1046784, 1022117] [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [2024-11-29 21:01:32] iteration 199/ 500 | consumed samples: 1592 | elapsed time per iteration (ms): 615958.8 | throughput per GPU (TFLOP/s/GPU): 84.5 | learning rate: 3.456330E-06 | global batch size: 8 | lm loss: 8.532241E-01 | loss scale: 1.0 | grad norm: 0.683 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [2024-11-29 21:10:41] iteration 200/ 500 | consumed samples: 1600 | elapsed time per iteration (ms): 548343.7 | throughput per GPU (TFLOP/s/GPU): 94.9 | learning rate: 3.441567E-06 | global batch size: 8 | lm loss: 8.384378E-01 | loss scale: 1.0 | grad norm: 0.805 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (261620.54, 261621.03) [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure processed_samples 8600 unjoint_samples 8600 joint_samples 25 [167071, 1045285] [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047181, 277034] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [395778, 1046907] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1032949, 580621] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047475, 684053] processed_samples 8600 unjoint_samples 8600 joint_samples 26 [439469, 1046533] [h264 @ 0x56215ec3c040] mmco: unref short failure processed_samples 8600 unjoint_samples 8600 joint_samples 24 [1040042, 611062] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [856930, 1043464] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047475, 684053] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [167071, 1045285] [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 8600 unjoint_samples 8600 joint_samples 26 [439469, 1046533] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1047181, 277034] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [395778, 1046907] processed_samples 8600 unjoint_samples 8600 joint_samples 25 [1032949, 580621] processed_samples 8600 unjoint_samples 8600 joint_samples 24 [1040042, 611062] [h264 @ 0x55d51a1f0840] mmco: unref short failure processed_samples 8600 unjoint_samples 8600 joint_samples 25 [856930, 1043464] [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [2024-11-29 21:26:31] iteration 201/ 500 | consumed samples: 1608 | elapsed time per iteration (ms): 688308.4 | throughput per GPU (TFLOP/s/GPU): 75.6 | learning rate: 3.426767E-06 | global batch size: 8 | lm loss: 8.486854E-01 | loss scale: 1.0 | grad norm: 0.611 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-11-29 21:35:52] iteration 202/ 500 | consumed samples: 1616 | elapsed time per iteration (ms): 561512.8 | throughput per GPU (TFLOP/s/GPU): 92.7 | learning rate: 3.411930E-06 | global batch size: 8 | lm loss: 8.823071E-01 | loss scale: 1.0 | grad norm: 0.766 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1047475, 15799] processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1047475, 15799] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [905731, 1046907] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [905731, 1046907] processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1045874, 85168] processed_samples 8700 unjoint_samples 8700 joint_samples 26 [1045874, 85168] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1047181, 547941] processed_samples 8700 unjoint_samples 8700 joint_samples 24 [1040042, 863555] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [549219, 1045285] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [549219, 1045285] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1032949, 903571] processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1047181, 547941] [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 8700 unjoint_samples 8700 joint_samples 25 [1032949, 903571] processed_samples 8700 unjoint_samples 8700 joint_samples 24 [1040042, 863555] [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 8700 unjoint_samples 8700 joint_samples 26 [756152, 1046533] processed_samples 8700 unjoint_samples 8700 joint_samples 26 [756152, 1046533] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [2024-11-29 21:45:23] iteration 203/ 500 | consumed samples: 1624 | elapsed time per iteration (ms): 570436.1 | throughput per GPU (TFLOP/s/GPU): 91.2 | learning rate: 3.397056E-06 | global batch size: 8 | lm loss: 7.746488E-01 | loss scale: 1.0 | grad norm: 0.692 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [2024-11-29 21:56:33] iteration 204/ 500 | consumed samples: 1632 | elapsed time per iteration (ms): 669819.5 | throughput per GPU (TFLOP/s/GPU): 77.7 | learning rate: 3.382147E-06 | global batch size: 8 | lm loss: 8.163378E-01 | loss scale: 1.0 | grad norm: 0.559 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 22:06:29] iteration 205/ 500 | consumed samples: 1640 | elapsed time per iteration (ms): 596426.7 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 3.367203E-06 | global batch size: 8 | lm loss: 8.137987E-01 | loss scale: 1.0 | grad norm: 0.586 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047277, 184480] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1038728, 266508] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1045874, 453231] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [104759, 1046728] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1047475, 320617] processed_samples 8800 unjoint_samples 8800 joint_samples 25 [873743, 1045285] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [995735, 1046533] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1038728, 266508] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [104759, 1046728] processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047277, 184480] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1047475, 320617] processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047181, 906245] processed_samples 8800 unjoint_samples 8800 joint_samples 25 [873743, 1045285] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [1045874, 453231] processed_samples 8800 unjoint_samples 8800 joint_samples 25 [1047181, 906245] processed_samples 8800 unjoint_samples 8800 joint_samples 26 [995735, 1046533] [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [2024-11-29 22:15:48] iteration 206/ 500 | consumed samples: 1648 | elapsed time per iteration (ms): 559389.8 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 3.352225E-06 | global batch size: 8 | lm loss: 8.419443E-01 | loss scale: 1.0 | grad norm: 0.620 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [2024-11-29 22:24:43] iteration 207/ 500 | consumed samples: 1656 | elapsed time per iteration (ms): 534930.3 | throughput per GPU (TFLOP/s/GPU): 97.3 | learning rate: 3.337214E-06 | global batch size: 8 | lm loss: 8.349214E-01 | loss scale: 1.0 | grad norm: 0.720 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-11-29 22:34:40] iteration 208/ 500 | consumed samples: 1664 | elapsed time per iteration (ms): 596473.5 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 3.322169E-06 | global batch size: 8 | lm loss: 8.279457E-01 | loss scale: 1.0 | grad norm: 0.641 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure processed_samples 8900 unjoint_samples 8900 joint_samples 26 [123257, 1046667] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [110864, 1045285] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1038728, 538213] processed_samples 8900 unjoint_samples 8900 joint_samples 25 [1047277, 683180] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [471169, 1046728] processed_samples 8900 unjoint_samples 8900 joint_samples 27 [1037217, 201421] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [123257, 1046667] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [110864, 1045285] processed_samples 8900 unjoint_samples 8900 joint_samples 27 [1037217, 201421] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1047475, 676775] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1038728, 538213] processed_samples 8900 unjoint_samples 8900 joint_samples 25 [1047277, 683180] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1045874, 725442] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1047475, 676775] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [471169, 1046728] processed_samples 8900 unjoint_samples 8900 joint_samples 26 [1045874, 725442] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x56215ee0f040] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure processed_samples 9000 unjoint_samples 9000 joint_samples 25 [1047277, 950375] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [512455, 1045285] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1038728, 849792] processed_samples 9000 unjoint_samples 9000 joint_samples 27 [1037217, 530048] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [464782, 1046667] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1045874, 994156] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1047475, 970385] [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure processed_samples 9000 unjoint_samples 9000 joint_samples 26 [704371, 1046728] [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 9000 unjoint_samples 9000 joint_samples 25 [1047277, 950375] [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 9000 unjoint_samples 9000 joint_samples 26 [512455, 1045285] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [464782, 1046667] processed_samples 9000 unjoint_samples 9000 joint_samples 27 [1037217, 530048] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1038728, 849792] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1047475, 970385] processed_samples 9000 unjoint_samples 9000 joint_samples 26 [1045874, 994156] [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure processed_samples 9000 unjoint_samples 9000 joint_samples 26 [704371, 1046728] [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-11-29 22:46:50] iteration 209/ 500 | consumed samples: 1672 | elapsed time per iteration (ms): 730291.0 | throughput per GPU (TFLOP/s/GPU): 71.3 | learning rate: 3.307092E-06 | global batch size: 8 | lm loss: 8.303400E-01 | loss scale: 1.0 | grad norm: 0.633 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 22:55:07] iteration 210/ 500 | consumed samples: 1680 | elapsed time per iteration (ms): 497043.8 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 3.291983E-06 | global batch size: 8 | lm loss: 7.948712E-01 | loss scale: 1.0 | grad norm: 0.621 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-29 23:04:42] iteration 211/ 500 | consumed samples: 1688 | elapsed time per iteration (ms): 574463.4 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 3.276843E-06 | global batch size: 8 | lm loss: 9.035169E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [2024-11-29 23:13:44] iteration 212/ 500 | consumed samples: 1696 | elapsed time per iteration (ms): 542007.3 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 3.261672E-06 | global batch size: 8 | lm loss: 9.377707E-01 | loss scale: 1.0 | grad norm: 0.808 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 9100 unjoint_samples 9100 joint_samples 26 [225650, 1046705] processed_samples 9100 unjoint_samples 9100 joint_samples 26 [795430, 1046667] processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1045874, 296818] processed_samples 9100 unjoint_samples 9100 joint_samples 27 [250395, 1033542] processed_samples 9100 unjoint_samples 9100 joint_samples 27 [91770, 1047083] processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1037217, 874982] processed_samples 9100 unjoint_samples 9100 joint_samples 26 [937867, 1046728] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure processed_samples 9100 unjoint_samples 9100 joint_samples 27 [91770, 1047083] [h264 @ 0x56215f060d00] mmco: unref short failure processed_samples 9100 unjoint_samples 9100 joint_samples 26 [225650, 1046705] processed_samples 9100 unjoint_samples 9100 joint_samples 27 [250395, 1033542] processed_samples 9100 unjoint_samples 9100 joint_samples 26 [895416, 1045285] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 9100 unjoint_samples 9100 joint_samples 26 [937867, 1046728] processed_samples 9100 unjoint_samples 9100 joint_samples 26 [795430, 1046667] processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1045874, 296818] processed_samples 9100 unjoint_samples 9100 joint_samples 26 [895416, 1045285] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 9100 unjoint_samples 9100 joint_samples 27 [1037217, 874982] [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [2024-11-29 23:23:09] iteration 213/ 500 | consumed samples: 1704 | elapsed time per iteration (ms): 564882.0 | throughput per GPU (TFLOP/s/GPU): 92.1 | learning rate: 3.246472E-06 | global batch size: 8 | lm loss: 8.029411E-01 | loss scale: 1.0 | grad norm: 0.617 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [2024-11-29 23:31:09] iteration 214/ 500 | consumed samples: 1712 | elapsed time per iteration (ms): 480448.6 | throughput per GPU (TFLOP/s/GPU): 108.3 | learning rate: 3.231242E-06 | global batch size: 8 | lm loss: 8.489534E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [2024-11-29 23:38:57] iteration 215/ 500 | consumed samples: 1720 | elapsed time per iteration (ms): 468395.2 | throughput per GPU (TFLOP/s/GPU): 111.1 | learning rate: 3.215984E-06 | global batch size: 8 | lm loss: 8.099984E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [2024-11-29 23:49:10] iteration 216/ 500 | consumed samples: 1728 | elapsed time per iteration (ms): 612723.6 | throughput per GPU (TFLOP/s/GPU): 84.9 | learning rate: 3.200697E-06 | global batch size: 8 | lm loss: 8.494784E-01 | loss scale: 1.0 | grad norm: 0.565 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 9200 unjoint_samples 9200 joint_samples 27 [73991, 1046667] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1012937, 220560] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [172362, 1046728] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1045874, 648534] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [73991, 1046667] processed_samples 9200 unjoint_samples 9200 joint_samples 28 [154822, 1039938] processed_samples 9200 unjoint_samples 9200 joint_samples 26 [530122, 1046705] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [172362, 1046728] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1012937, 220560] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [462192, 1033542] processed_samples 9200 unjoint_samples 9200 joint_samples 28 [154822, 1039938] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [465065, 1047083] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [465065, 1047083] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [462192, 1033542] processed_samples 9200 unjoint_samples 9200 joint_samples 26 [530122, 1046705] processed_samples 9200 unjoint_samples 9200 joint_samples 27 [1045874, 648534] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215bb389c0] mmco: unref short failure [h264 @ 0x56215bb389c0] mmco: unref short failure [h264 @ 0x56215bb389c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb389c0] mmco: unref short failure [h264 @ 0x56215bb389c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1012937, 522162] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [460248, 1046728] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [882129, 1033542] processed_samples 9300 unjoint_samples 9300 joint_samples 26 [908179, 1046705] processed_samples 9300 unjoint_samples 9300 joint_samples 28 [552738, 1039938] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [492165, 1046667] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1045874, 936492] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [810535, 1047083] [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 9300 unjoint_samples 9300 joint_samples 27 [492165, 1046667] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [460248, 1046728] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1012937, 522162] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [882129, 1033542] processed_samples 9300 unjoint_samples 9300 joint_samples 26 [908179, 1046705] processed_samples 9300 unjoint_samples 9300 joint_samples 28 [552738, 1039938] processed_samples 9300 unjoint_samples 9300 joint_samples 27 [810535, 1047083] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure processed_samples 9300 unjoint_samples 9300 joint_samples 27 [1045874, 936492] [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [2024-11-30 00:00:32] iteration 217/ 500 | consumed samples: 1736 | elapsed time per iteration (ms): 682231.1 | throughput per GPU (TFLOP/s/GPU): 76.3 | learning rate: 3.185384E-06 | global batch size: 8 | lm loss: 8.043656E-01 | loss scale: 1.0 | grad norm: 0.589 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [2024-11-30 00:08:50] iteration 218/ 500 | consumed samples: 1744 | elapsed time per iteration (ms): 497708.4 | throughput per GPU (TFLOP/s/GPU): 104.6 | learning rate: 3.170044E-06 | global batch size: 8 | lm loss: 9.035359E-01 | loss scale: 1.0 | grad norm: 0.570 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [2024-11-30 00:19:10] iteration 219/ 500 | consumed samples: 1752 | elapsed time per iteration (ms): 619806.1 | throughput per GPU (TFLOP/s/GPU): 84.0 | learning rate: 3.154678E-06 | global batch size: 8 | lm loss: 7.692378E-01 | loss scale: 1.0 | grad norm: 0.671 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [2024-11-30 00:27:23] iteration 220/ 500 | consumed samples: 1760 | elapsed time per iteration (ms): 493399.7 | throughput per GPU (TFLOP/s/GPU): 105.5 | learning rate: 3.139286E-06 | global batch size: 8 | lm loss: 8.335562E-01 | loss scale: 1.0 | grad norm: 0.625 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (272546.17, 272546.63) [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 27 [206860, 1046819] processed_samples 9400 unjoint_samples 9400 joint_samples 27 [924978, 1046667] processed_samples 9400 unjoint_samples 9400 joint_samples 28 [158827, 1047083] processed_samples 9400 unjoint_samples 9400 joint_samples 28 [220743, 1046128] processed_samples 9400 unjoint_samples 9400 joint_samples 28 [1044569, 139781] processed_samples 9400 unjoint_samples 9400 joint_samples 27 [1012937, 784226] processed_samples 9400 unjoint_samples 9400 joint_samples 28 [809095, 1039938] processed_samples 9400 unjoint_samples 9400 joint_samples 27 [876727, 1046728] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 27 [206860, 1046819] processed_samples 9400 unjoint_samples 9400 joint_samples 27 [924978, 1046667] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 27 [1012937, 784226] [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 28 [158827, 1047083] [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 28 [1044569, 139781] processed_samples 9400 unjoint_samples 9400 joint_samples 28 [220743, 1046128] [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 28 [809095, 1039938] [h264 @ 0x55d51326c900] mmco: unref short failure processed_samples 9400 unjoint_samples 9400 joint_samples 27 [876727, 1046728] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [2024-11-30 00:41:52] iteration 221/ 500 | consumed samples: 1768 | elapsed time per iteration (ms): 596070.1 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 3.123870E-06 | global batch size: 8 | lm loss: 8.337880E-01 | loss scale: 1.0 | grad norm: 0.659 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [2024-11-30 00:50:18] iteration 222/ 500 | consumed samples: 1776 | elapsed time per iteration (ms): 505900.6 | throughput per GPU (TFLOP/s/GPU): 102.9 | learning rate: 3.108430E-06 | global batch size: 8 | lm loss: 8.406662E-01 | loss scale: 1.0 | grad norm: 0.617 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-30 00:59:00] iteration 223/ 500 | consumed samples: 1784 | elapsed time per iteration (ms): 521908.2 | throughput per GPU (TFLOP/s/GPU): 99.7 | learning rate: 3.092966E-06 | global batch size: 8 | lm loss: 8.790482E-01 | loss scale: 1.0 | grad norm: 0.600 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [2024-11-30 01:07:02] iteration 224/ 500 | consumed samples: 1792 | elapsed time per iteration (ms): 481628.5 | throughput per GPU (TFLOP/s/GPU): 108.1 | learning rate: 3.077479E-06 | global batch size: 8 | lm loss: 8.450036E-01 | loss scale: 1.0 | grad norm: 0.597 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 9500 unjoint_samples 9500 joint_samples 28 [137705, 1019544] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [137705, 1019544] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1040380, 142585] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1040380, 142585] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1038227, 242054] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1038227, 242054] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [410129, 1047083] processed_samples 9500 unjoint_samples 9500 joint_samples 29 [1024028, 124413] processed_samples 9500 unjoint_samples 9500 joint_samples 29 [1024028, 124413] [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure processed_samples 9500 unjoint_samples 9500 joint_samples 27 [512095, 1046819] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [410129, 1047083] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [629736, 1046128] processed_samples 9500 unjoint_samples 9500 joint_samples 27 [512095, 1046819] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [629736, 1046128] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1044569, 396197] processed_samples 9500 unjoint_samples 9500 joint_samples 28 [1044569, 396197] [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1038227, 628474] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [478086, 1019544] processed_samples 9600 unjoint_samples 9600 joint_samples 29 [1024028, 448485] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [669891, 1047083] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1040380, 432600] [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1044569, 719903] processed_samples 9600 unjoint_samples 9600 joint_samples 27 [795874, 1046819] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [920557, 1046128] [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 9600 unjoint_samples 9600 joint_samples 28 [478086, 1019544] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [669891, 1047083] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1040380, 432600] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1038227, 628474] processed_samples 9600 unjoint_samples 9600 joint_samples 29 [1024028, 448485] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [920557, 1046128] processed_samples 9600 unjoint_samples 9600 joint_samples 27 [795874, 1046819] processed_samples 9600 unjoint_samples 9600 joint_samples 28 [1044569, 719903] [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [2024-11-30 01:18:04] iteration 225/ 500 | consumed samples: 1800 | elapsed time per iteration (ms): 662503.8 | throughput per GPU (TFLOP/s/GPU): 78.6 | learning rate: 3.061971E-06 | global batch size: 8 | lm loss: 7.138479E-01 | loss scale: 1.0 | grad norm: 0.706 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [2024-11-30 01:26:49] iteration 226/ 500 | consumed samples: 1808 | elapsed time per iteration (ms): 525016.6 | throughput per GPU (TFLOP/s/GPU): 99.1 | learning rate: 3.046440E-06 | global batch size: 8 | lm loss: 8.335893E-01 | loss scale: 1.0 | grad norm: 0.724 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1013319, 298023] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1040380, 753225] processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1028990, 185829] processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1024028, 795797] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [805458, 1019544] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1044569, 1033781] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [994371, 1047083] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1038227, 910477] [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1040380, 753225] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1028990, 185829] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1013319, 298023] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [994371, 1047083] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [805458, 1019544] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1038227, 910477] processed_samples 9700 unjoint_samples 9700 joint_samples 29 [1024028, 795797] processed_samples 9700 unjoint_samples 9700 joint_samples 28 [1044569, 1033781] [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [2024-11-30 01:39:17] iteration 227/ 500 | consumed samples: 1816 | elapsed time per iteration (ms): 747736.8 | throughput per GPU (TFLOP/s/GPU): 69.6 | learning rate: 3.030889E-06 | global batch size: 8 | lm loss: 8.548321E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [2024-11-30 01:49:00] iteration 228/ 500 | consumed samples: 1824 | elapsed time per iteration (ms): 583444.9 | throughput per GPU (TFLOP/s/GPU): 89.2 | learning rate: 3.015318E-06 | global batch size: 8 | lm loss: 8.605866E-01 | loss scale: 1.0 | grad norm: 1.385 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215d202940] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [2024-11-30 01:58:18] iteration 229/ 500 | consumed samples: 1832 | elapsed time per iteration (ms): 558010.4 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 2.999727E-06 | global batch size: 8 | lm loss: 8.125361E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [2024-11-30 02:06:47] iteration 230/ 500 | consumed samples: 1840 | elapsed time per iteration (ms): 509049.1 | throughput per GPU (TFLOP/s/GPU): 102.2 | learning rate: 2.984118E-06 | global batch size: 8 | lm loss: 9.170176E-01 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 02:15:32] iteration 231/ 500 | consumed samples: 1848 | elapsed time per iteration (ms): 524853.1 | throughput per GPU (TFLOP/s/GPU): 99.2 | learning rate: 2.968490E-06 | global batch size: 8 | lm loss: 8.902583E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215afffc40] mmco: unref short failure [h264 @ 0x56215afffc40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure processed_samples 9800 unjoint_samples 9800 joint_samples 29 [203755, 1047083] processed_samples 9800 unjoint_samples 9800 joint_samples 28 [1013319, 591862] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [81067, 1039558] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [290606, 1046311] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [197089, 1044398] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1038227, 125259] [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1028990, 462177] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1024028, 1011840] [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 9800 unjoint_samples 9800 joint_samples 29 [203755, 1047083] processed_samples 9800 unjoint_samples 9800 joint_samples 28 [1013319, 591862] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [81067, 1039558] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1038227, 125259] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [197089, 1044398] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [290606, 1046311] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516a44800] mmco: unref short failure [h264 @ 0x55d516a44800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1028990, 462177] processed_samples 9800 unjoint_samples 9800 joint_samples 29 [1024028, 1011840] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [2024-11-30 02:22:56] iteration 232/ 500 | consumed samples: 1856 | elapsed time per iteration (ms): 443719.1 | throughput per GPU (TFLOP/s/GPU): 117.3 | learning rate: 2.952845E-06 | global batch size: 8 | lm loss: 7.840450E-01 | loss scale: 1.0 | grad norm: 0.627 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure processed_samples 9900 unjoint_samples 9900 joint_samples 29 [386430, 1039558] processed_samples 9900 unjoint_samples 9900 joint_samples 30 [1031356, 468096] [h264 @ 0x56215d8ddd40] mmco: unref short failure processed_samples 9900 unjoint_samples 9900 joint_samples 29 [708788, 1044398] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1038227, 385286] processed_samples 9900 unjoint_samples 9900 joint_samples 28 [1013319, 953069] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [584501, 1047083] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [588057, 1046311] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1028990, 714164] [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 9900 unjoint_samples 9900 joint_samples 29 [584501, 1047083] [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 9900 unjoint_samples 9900 joint_samples 29 [386430, 1039558] processed_samples 9900 unjoint_samples 9900 joint_samples 30 [1031356, 468096] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1038227, 385286] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [588057, 1046311] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [1028990, 714164] processed_samples 9900 unjoint_samples 9900 joint_samples 29 [708788, 1044398] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 9900 unjoint_samples 9900 joint_samples 28 [1013319, 953069] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [2024-11-30 02:32:45] iteration 233/ 500 | consumed samples: 1864 | elapsed time per iteration (ms): 589102.1 | throughput per GPU (TFLOP/s/GPU): 88.3 | learning rate: 2.937183E-06 | global batch size: 8 | lm loss: 8.486030E-01 | loss scale: 1.0 | grad norm: 0.649 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [2024-11-30 02:42:05] iteration 234/ 500 | consumed samples: 1872 | elapsed time per iteration (ms): 559559.4 | throughput per GPU (TFLOP/s/GPU): 93.0 | learning rate: 2.921504E-06 | global batch size: 8 | lm loss: 8.358369E-01 | loss scale: 1.0 | grad norm: 0.590 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1048318, 146360] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1039497, 93104] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1038227, 686845] processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1031356, 786194] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [892478, 1046311] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1028990, 1031122] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [674954, 1039558] processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1039497, 93104] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1048318, 146360] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [892478, 1046311] processed_samples 10000 unjoint_samples 10000 joint_samples 30 [1031356, 786194] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [964007, 1047083] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [674954, 1039558] [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1038227, 686845] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [964007, 1047083] processed_samples 10000 unjoint_samples 10000 joint_samples 29 [1028990, 1031122] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [2024-11-30 02:52:33] iteration 235/ 500 | consumed samples: 1880 | elapsed time per iteration (ms): 628444.9 | throughput per GPU (TFLOP/s/GPU): 82.8 | learning rate: 2.905810E-06 | global batch size: 8 | lm loss: 8.173471E-01 | loss scale: 1.0 | grad norm: 0.545 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 03:02:30] iteration 236/ 500 | consumed samples: 1888 | elapsed time per iteration (ms): 597310.8 | throughput per GPU (TFLOP/s/GPU): 87.1 | learning rate: 2.890101E-06 | global batch size: 8 | lm loss: 7.991673E-01 | loss scale: 1.0 | grad norm: 0.632 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 03:14:13] iteration 237/ 500 | consumed samples: 1896 | elapsed time per iteration (ms): 702636.3 | throughput per GPU (TFLOP/s/GPU): 74.1 | learning rate: 2.874378E-06 | global batch size: 8 | lm loss: 8.239874E-01 | loss scale: 1.0 | grad norm: 0.683 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1048318, 428681] [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1039497, 395314] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1045781, 248108] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [959819, 399025] processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1038227, 931388] [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 29 [911937, 1039558] [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1031356, 1034010] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1048096, 356039] [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1048318, 428681] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1039497, 395314] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1045781, 248108] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [959819, 399025] processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1031356, 1034010] processed_samples 10100 unjoint_samples 10100 joint_samples 29 [911937, 1039558] [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 29 [1038227, 931388] [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 10100 unjoint_samples 10100 joint_samples 30 [1048096, 356039] [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [2024-11-30 03:23:23] iteration 238/ 500 | consumed samples: 1904 | elapsed time per iteration (ms): 550026.1 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 2.858641E-06 | global batch size: 8 | lm loss: 8.139384E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [2024-11-30 03:33:08] iteration 239/ 500 | consumed samples: 1912 | elapsed time per iteration (ms): 584777.9 | throughput per GPU (TFLOP/s/GPU): 89.0 | learning rate: 2.842891E-06 | global batch size: 8 | lm loss: 8.176109E-01 | loss scale: 1.0 | grad norm: 0.641 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-11-30 03:41:47] iteration 240/ 500 | consumed samples: 1920 | elapsed time per iteration (ms): 519125.6 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.827129E-06 | global batch size: 8 | lm loss: 7.934645E-01 | loss scale: 1.0 | grad norm: 0.745 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (274797.42, 274798.34) [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure processed_samples 10200 unjoint_samples 10200 joint_samples 31 [198851, 1047383] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [235979, 1041682] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [363659, 1010132] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1045781, 514167] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1039497, 626871] processed_samples 10200 unjoint_samples 10200 joint_samples 29 [1048318, 719459] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [959819, 714686] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1048096, 696454] [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 10200 unjoint_samples 10200 joint_samples 30 [363659, 1010132] processed_samples 10200 unjoint_samples 10200 joint_samples 31 [198851, 1047383] processed_samples 10200 unjoint_samples 10200 joint_samples 29 [1048318, 719459] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [235979, 1041682] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1039497, 626871] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1045781, 514167] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [959819, 714686] processed_samples 10200 unjoint_samples 10200 joint_samples 30 [1048096, 696454] [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [2024-11-30 03:58:00] iteration 241/ 500 | consumed samples: 1928 | elapsed time per iteration (ms): 697775.1 | throughput per GPU (TFLOP/s/GPU): 74.6 | learning rate: 2.811355E-06 | global batch size: 8 | lm loss: 8.460391E-01 | loss scale: 1.0 | grad norm: 0.582 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [2024-11-30 04:09:17] iteration 242/ 500 | consumed samples: 1936 | elapsed time per iteration (ms): 676982.5 | throughput per GPU (TFLOP/s/GPU): 76.9 | learning rate: 2.795570E-06 | global batch size: 8 | lm loss: 8.695595E-01 | loss scale: 1.0 | grad norm: 0.695 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 10300 unjoint_samples 10300 joint_samples 30 [669771, 1010132] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [80313, 1011814] processed_samples 10300 unjoint_samples 10300 joint_samples 31 [39135, 1034652] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1045781, 761153] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [460680, 1041682] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1039497, 933155] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [966289, 967539] [h264 @ 0x56215c18da40] mmco: unref short failure processed_samples 10300 unjoint_samples 10300 joint_samples 31 [551478, 1047383] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [669771, 1010132] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [80313, 1011814] [h264 @ 0x56215ee100c0] mmco: unref short failure processed_samples 10300 unjoint_samples 10300 joint_samples 31 [39135, 1034652] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1045781, 761153] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [460680, 1041682] processed_samples 10300 unjoint_samples 10300 joint_samples 31 [551478, 1047383] processed_samples 10300 unjoint_samples 10300 joint_samples 30 [1039497, 933155] [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 10300 unjoint_samples 10300 joint_samples 30 [966289, 967539] [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [2024-11-30 04:17:54] iteration 243/ 500 | consumed samples: 1944 | elapsed time per iteration (ms): 517429.4 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 2.779775E-06 | global batch size: 8 | lm loss: 8.456820E-01 | loss scale: 1.0 | grad norm: 0.564 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [2024-11-30 04:28:27] iteration 244/ 500 | consumed samples: 1952 | elapsed time per iteration (ms): 633075.7 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 2.763971E-06 | global batch size: 8 | lm loss: 8.324536E-01 | loss scale: 1.0 | grad norm: 0.590 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure processed_samples 10400 unjoint_samples 10400 joint_samples 30 [435968, 1011814] [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure processed_samples 10400 unjoint_samples 10400 joint_samples 30 [1045781, 975766] [h264 @ 0x56215be9ff00] mmco: unref short failure processed_samples 10400 unjoint_samples 10400 joint_samples 31 [183452, 1041048] processed_samples 10400 unjoint_samples 10400 joint_samples 31 [844115, 1047383] processed_samples 10400 unjoint_samples 10400 joint_samples 31 [426840, 1034652] processed_samples 10400 unjoint_samples 10400 joint_samples 31 [172853, 1038532] processed_samples 10400 unjoint_samples 10400 joint_samples 30 [776971, 1041682] processed_samples 10400 unjoint_samples 10400 joint_samples 30 [977505, 1010132] [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure processed_samples 10400 unjoint_samples 10400 joint_samples 30 [435968, 1011814] processed_samples 10400 unjoint_samples 10400 joint_samples 31 [172853, 1038532] processed_samples 10400 unjoint_samples 10400 joint_samples 30 [977505, 1010132] [h264 @ 0x55d512fcdfc0] mmco: unref short failure processed_samples 10400 unjoint_samples 10400 joint_samples 31 [844115, 1047383] processed_samples 10400 unjoint_samples 10400 joint_samples 31 [183452, 1041048] processed_samples 10400 unjoint_samples 10400 joint_samples 30 [776971, 1041682] processed_samples 10400 unjoint_samples 10400 joint_samples 31 [426840, 1034652] [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure processed_samples 10400 unjoint_samples 10400 joint_samples 30 [1045781, 975766] [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [2024-11-30 04:41:34] iteration 245/ 500 | consumed samples: 1960 | elapsed time per iteration (ms): 786820.4 | throughput per GPU (TFLOP/s/GPU): 66.1 | learning rate: 2.748157E-06 | global batch size: 8 | lm loss: 8.650105E-01 | loss scale: 1.0 | grad norm: 0.570 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [2024-11-30 04:49:37] iteration 246/ 500 | consumed samples: 1968 | elapsed time per iteration (ms): 483113.6 | throughput per GPU (TFLOP/s/GPU): 107.7 | learning rate: 2.732335E-06 | global batch size: 8 | lm loss: 8.726799E-01 | loss scale: 1.0 | grad norm: 0.605 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [2024-11-30 04:59:17] iteration 247/ 500 | consumed samples: 1976 | elapsed time per iteration (ms): 580058.3 | throughput per GPU (TFLOP/s/GPU): 89.7 | learning rate: 2.716506E-06 | global batch size: 8 | lm loss: 8.287100E-01 | loss scale: 1.0 | grad norm: 0.680 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure processed_samples 10500 unjoint_samples 10500 joint_samples 31 [1033447, 194045] [h264 @ 0x56215bcafa40] mmco: unref short failure processed_samples 10500 unjoint_samples 10500 joint_samples 31 [467014, 1038532] processed_samples 10500 unjoint_samples 10500 joint_samples 32 [156504, 1048173] processed_samples 10500 unjoint_samples 10500 joint_samples 31 [218036, 1034239] processed_samples 10500 unjoint_samples 10500 joint_samples 30 [782697, 1011814] processed_samples 10500 unjoint_samples 10500 joint_samples 31 [461965, 1041048] processed_samples 10500 unjoint_samples 10500 joint_samples 31 [1033447, 194045] processed_samples 10500 unjoint_samples 10500 joint_samples 31 [467014, 1038532] processed_samples 10500 unjoint_samples 10500 joint_samples 30 [1041885, 1041682] [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure processed_samples 10500 unjoint_samples 10500 joint_samples 31 [712238, 1034652] processed_samples 10500 unjoint_samples 10500 joint_samples 30 [782697, 1011814] [h264 @ 0x55d5141fd280] mmco: unref short failure processed_samples 10500 unjoint_samples 10500 joint_samples 32 [156504, 1048173] processed_samples 10500 unjoint_samples 10500 joint_samples 31 [218036, 1034239] [h264 @ 0x5621618e98c0] mmco: unref short failure processed_samples 10500 unjoint_samples 10500 joint_samples 31 [461965, 1041048] processed_samples 10500 unjoint_samples 10500 joint_samples 31 [712238, 1034652] processed_samples 10500 unjoint_samples 10500 joint_samples 30 [1041885, 1041682] [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [2024-11-30 05:07:22] iteration 248/ 500 | consumed samples: 1984 | elapsed time per iteration (ms): 484634.2 | throughput per GPU (TFLOP/s/GPU): 107.4 | learning rate: 2.700669E-06 | global batch size: 8 | lm loss: 8.482005E-01 | loss scale: 1.0 | grad norm: 0.622 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [2024-11-30 05:15:43] iteration 249/ 500 | consumed samples: 1992 | elapsed time per iteration (ms): 501138.8 | throughput per GPU (TFLOP/s/GPU): 103.8 | learning rate: 2.684826E-06 | global batch size: 8 | lm loss: 8.958365E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [2024-11-30 05:23:37] iteration 250/ 500 | consumed samples: 2000 | elapsed time per iteration (ms): 473859.9 | throughput per GPU (TFLOP/s/GPU): 109.8 | learning rate: 2.668977E-06 | global batch size: 8 | lm loss: 8.291722E-01 | loss scale: 1.0 | grad norm: 0.640 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 10600 unjoint_samples 10600 joint_samples 32 [1041514, 11719] processed_samples 10600 unjoint_samples 10600 joint_samples 32 [1041514, 11719] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1037875, 96482] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1037875, 96482] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [306471, 1047224] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [306471, 1047224] processed_samples 10600 unjoint_samples 10600 joint_samples 32 [383211, 1048173] processed_samples 10600 unjoint_samples 10600 joint_samples 32 [383211, 1048173] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [447013, 1034239] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [447013, 1034239] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1033447, 583281] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [1033447, 583281] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [774199, 1038532] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [698356, 1041048] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [698356, 1041048] processed_samples 10600 unjoint_samples 10600 joint_samples 31 [774199, 1038532] [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [2024-11-30 05:32:01] iteration 251/ 500 | consumed samples: 2008 | elapsed time per iteration (ms): 503831.0 | throughput per GPU (TFLOP/s/GPU): 103.3 | learning rate: 2.653124E-06 | global batch size: 8 | lm loss: 8.277208E-01 | loss scale: 1.0 | grad norm: 0.663 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 10700 unjoint_samples 10700 joint_samples 32 [128410, 1045523] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1037875, 472499] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [624422, 1047224] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1033447, 841174] processed_samples 10700 unjoint_samples 10700 joint_samples 32 [1041514, 312682] processed_samples 10700 unjoint_samples 10700 joint_samples 32 [128410, 1045523] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1043363, 1044551] processed_samples 10700 unjoint_samples 10700 joint_samples 32 [704600, 1048173] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1037875, 472499] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [624422, 1047224] processed_samples 10700 unjoint_samples 10700 joint_samples 32 [1041514, 312682] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1033447, 841174] [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 10700 unjoint_samples 10700 joint_samples 31 [1043363, 1044551] processed_samples 10700 unjoint_samples 10700 joint_samples 31 [892183, 1034239] processed_samples 10700 unjoint_samples 10700 joint_samples 32 [704600, 1048173] [h264 @ 0x56215ec3c040] mmco: unref short failure processed_samples 10700 unjoint_samples 10700 joint_samples 31 [892183, 1034239] [2024-11-30 05:40:40] iteration 252/ 500 | consumed samples: 2016 | elapsed time per iteration (ms): 519264.0 | throughput per GPU (TFLOP/s/GPU): 100.2 | learning rate: 2.637266E-06 | global batch size: 8 | lm loss: 8.455402E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [2024-11-30 05:50:40] iteration 253/ 500 | consumed samples: 2024 | elapsed time per iteration (ms): 600427.9 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 2.621404E-06 | global batch size: 8 | lm loss: 8.403062E-01 | loss scale: 1.0 | grad norm: 1.539 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [2024-11-30 05:59:21] iteration 254/ 500 | consumed samples: 2032 | elapsed time per iteration (ms): 520958.1 | throughput per GPU (TFLOP/s/GPU): 99.9 | learning rate: 2.605540E-06 | global batch size: 8 | lm loss: 8.314257E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1035752, 162888] processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1045921, 146846] processed_samples 10800 unjoint_samples 10800 joint_samples 32 [315387, 1044551] processed_samples 10800 unjoint_samples 10800 joint_samples 31 [942124, 1047224] processed_samples 10800 unjoint_samples 10800 joint_samples 31 [1037875, 835556] [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 10800 unjoint_samples 10800 joint_samples 32 [449003, 1045523] [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 10800 unjoint_samples 10800 joint_samples 33 [1032065, 29516] [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1035752, 162888] processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1045921, 146846] processed_samples 10800 unjoint_samples 10800 joint_samples 31 [1037875, 835556] processed_samples 10800 unjoint_samples 10800 joint_samples 32 [449003, 1045523] processed_samples 10800 unjoint_samples 10800 joint_samples 32 [315387, 1044551] processed_samples 10800 unjoint_samples 10800 joint_samples 31 [942124, 1047224] processed_samples 10800 unjoint_samples 10800 joint_samples 33 [1032065, 29516] [2024-11-30 06:09:13] iteration 255/ 500 | consumed samples: 2040 | elapsed time per iteration (ms): 591496.3 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 2.589673E-06 | global batch size: 8 | lm loss: 8.430896E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1041514, 655987] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 10800 unjoint_samples 10800 joint_samples 32 [1041514, 655987] [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [2024-11-30 06:16:46] iteration 256/ 500 | consumed samples: 2048 | elapsed time per iteration (ms): 453571.3 | throughput per GPU (TFLOP/s/GPU): 114.7 | learning rate: 2.573804E-06 | global batch size: 8 | lm loss: 7.856827E-01 | loss scale: 1.0 | grad norm: 0.538 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [2024-11-30 06:26:13] iteration 257/ 500 | consumed samples: 2056 | elapsed time per iteration (ms): 566571.4 | throughput per GPU (TFLOP/s/GPU): 91.9 | learning rate: 2.557935E-06 | global batch size: 8 | lm loss: 7.880665E-01 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [2024-11-30 06:34:21] iteration 258/ 500 | consumed samples: 2064 | elapsed time per iteration (ms): 488091.6 | throughput per GPU (TFLOP/s/GPU): 106.6 | learning rate: 2.542065E-06 | global batch size: 8 | lm loss: 8.652607E-01 | loss scale: 1.0 | grad norm: 0.579 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 10900 unjoint_samples 10900 joint_samples 32 [190252, 1037338] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [190252, 1037338] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1046966, 215279] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1046966, 215279] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1035752, 459328] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [722456, 1045523] processed_samples 10900 unjoint_samples 10900 joint_samples 33 [1032065, 351246] processed_samples 10900 unjoint_samples 10900 joint_samples 33 [1032065, 351246] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1035752, 459328] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1045921, 403937] processed_samples 10900 unjoint_samples 10900 joint_samples 33 [57951, 1023521] processed_samples 10900 unjoint_samples 10900 joint_samples 33 [57951, 1023521] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [1045921, 403937] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [722456, 1045523] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [575214, 1044551] processed_samples 10900 unjoint_samples 10900 joint_samples 32 [575214, 1044551] [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 11000 unjoint_samples 11000 joint_samples 32 [489455, 1037338] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1046966, 552808] processed_samples 11000 unjoint_samples 11000 joint_samples 33 [381536, 1023521] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1035752, 843715] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1045921, 661662] processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1032065, 659269] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [893684, 1044551] processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1044162, 75800] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1035752, 843715] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [489455, 1037338] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1046966, 552808] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [1045921, 661662] processed_samples 11000 unjoint_samples 11000 joint_samples 33 [381536, 1023521] processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1032065, 659269] processed_samples 11000 unjoint_samples 11000 joint_samples 33 [1044162, 75800] processed_samples 11000 unjoint_samples 11000 joint_samples 32 [893684, 1044551] [2024-11-30 06:44:34] iteration 259/ 500 | consumed samples: 2072 | elapsed time per iteration (ms): 612328.8 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 2.526196E-06 | global batch size: 8 | lm loss: 8.281539E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [2024-11-30 06:53:14] iteration 260/ 500 | consumed samples: 2080 | elapsed time per iteration (ms): 520294.5 | throughput per GPU (TFLOP/s/GPU): 100.0 | learning rate: 2.510327E-06 | global batch size: 8 | lm loss: 8.611338E-01 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (266645.05, 266646.24) [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [2024-11-30 07:08:37] iteration 261/ 500 | consumed samples: 2088 | elapsed time per iteration (ms): 656543.1 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 2.494460E-06 | global batch size: 8 | lm loss: 8.357545E-01 | loss scale: 1.0 | grad norm: 0.552 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 07:18:10] iteration 262/ 500 | consumed samples: 2096 | elapsed time per iteration (ms): 573062.9 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 2.478596E-06 | global batch size: 8 | lm loss: 8.310040E-01 | loss scale: 1.0 | grad norm: 0.663 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 11100 unjoint_samples 11100 joint_samples 33 [90226, 1044222] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [90226, 1044222] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [35137, 1023359] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [35137, 1023359] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1045055, 96004] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1045055, 96004] processed_samples 11100 unjoint_samples 11100 joint_samples 32 [1046966, 1009349] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1044162, 432440] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1044162, 432440] processed_samples 11100 unjoint_samples 11100 joint_samples 32 [917931, 1037338] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1032065, 1004680] processed_samples 11100 unjoint_samples 11100 joint_samples 32 [917931, 1037338] processed_samples 11100 unjoint_samples 11100 joint_samples 32 [1046966, 1009349] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [1032065, 1004680] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [626501, 1023521] processed_samples 11100 unjoint_samples 11100 joint_samples 33 [626501, 1023521] [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [2024-11-30 07:27:28] iteration 263/ 500 | consumed samples: 2104 | elapsed time per iteration (ms): 558294.7 | throughput per GPU (TFLOP/s/GPU): 93.2 | learning rate: 2.462734E-06 | global batch size: 8 | lm loss: 8.269715E-01 | loss scale: 1.0 | grad norm: 0.520 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 07:35:47] iteration 264/ 500 | consumed samples: 2112 | elapsed time per iteration (ms): 498352.4 | throughput per GPU (TFLOP/s/GPU): 104.4 | learning rate: 2.446876E-06 | global batch size: 8 | lm loss: 8.242128E-01 | loss scale: 1.0 | grad norm: 0.618 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [2024-11-30 07:46:39] iteration 265/ 500 | consumed samples: 2120 | elapsed time per iteration (ms): 652257.7 | throughput per GPU (TFLOP/s/GPU): 79.8 | learning rate: 2.431023E-06 | global batch size: 8 | lm loss: 8.014107E-01 | loss scale: 1.0 | grad norm: 0.623 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 11200 unjoint_samples 11200 joint_samples 33 [370324, 1044222] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1006570, 223978] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [272745, 1046448] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [258289, 1023359] [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1045055, 431566] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1044162, 740665] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [937926, 1023521] processed_samples 11200 unjoint_samples 11200 joint_samples 34 [316008, 1046612] [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1006570, 223978] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1045055, 431566] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [272745, 1046448] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [370324, 1044222] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [258289, 1023359] processed_samples 11200 unjoint_samples 11200 joint_samples 33 [1044162, 740665] processed_samples 11200 unjoint_samples 11200 joint_samples 34 [316008, 1046612] [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 11200 unjoint_samples 11200 joint_samples 33 [937926, 1023521] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [2024-11-30 07:57:17] iteration 266/ 500 | consumed samples: 2128 | elapsed time per iteration (ms): 638306.7 | throughput per GPU (TFLOP/s/GPU): 81.5 | learning rate: 2.415174E-06 | global batch size: 8 | lm loss: 8.306779E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1006570, 490094] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [607037, 1044222] [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure processed_samples 11300 unjoint_samples 11300 joint_samples 33 [783103, 1046448] processed_samples 11300 unjoint_samples 11300 joint_samples 34 [1034746, 237539] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1045055, 705299] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1044162, 1039867] processed_samples 11300 unjoint_samples 11300 joint_samples 34 [671219, 1046612] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure processed_samples 11300 unjoint_samples 11300 joint_samples 33 [602074, 1023359] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1006570, 490094] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [607037, 1044222] processed_samples 11300 unjoint_samples 11300 joint_samples 34 [1034746, 237539] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [783103, 1046448] processed_samples 11300 unjoint_samples 11300 joint_samples 34 [671219, 1046612] [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1044162, 1039867] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [602074, 1023359] processed_samples 11300 unjoint_samples 11300 joint_samples 33 [1045055, 705299] [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [2024-11-30 08:05:16] iteration 267/ 500 | consumed samples: 2136 | elapsed time per iteration (ms): 478283.8 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 2.399331E-06 | global batch size: 8 | lm loss: 8.054468E-01 | loss scale: 1.0 | grad norm: 0.702 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [2024-11-30 08:13:14] iteration 268/ 500 | consumed samples: 2144 | elapsed time per iteration (ms): 478182.6 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 2.383494E-06 | global batch size: 8 | lm loss: 7.975258E-01 | loss scale: 1.0 | grad norm: 1.340 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [2024-11-30 08:22:18] iteration 269/ 500 | consumed samples: 2152 | elapsed time per iteration (ms): 544529.5 | throughput per GPU (TFLOP/s/GPU): 95.6 | learning rate: 2.367665E-06 | global batch size: 8 | lm loss: 8.677651E-01 | loss scale: 1.0 | grad norm: 0.647 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 11400 unjoint_samples 11400 joint_samples 34 [93122, 1042734] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [950986, 1046612] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1045028, 137747] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1046638, 420718] processed_samples 11400 unjoint_samples 11400 joint_samples 33 [847191, 1023359] processed_samples 11400 unjoint_samples 11400 joint_samples 33 [847191, 1023359] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [93122, 1042734] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1045028, 137747] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1034746, 494401] processed_samples 11400 unjoint_samples 11400 joint_samples 33 [912762, 1044222] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [950986, 1046612] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1046638, 420718] processed_samples 11400 unjoint_samples 11400 joint_samples 34 [1034746, 494401] processed_samples 11400 unjoint_samples 11400 joint_samples 33 [912762, 1044222] processed_samples 11400 unjoint_samples 11400 joint_samples 33 [1006570, 814426] processed_samples 11400 unjoint_samples 11400 joint_samples 33 [1006570, 814426] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [2024-11-30 08:30:18] iteration 270/ 500 | consumed samples: 2160 | elapsed time per iteration (ms): 479974.0 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 2.351843E-06 | global batch size: 8 | lm loss: 8.280485E-01 | loss scale: 1.0 | grad norm: 0.675 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b931500] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d513c38380] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [2024-11-30 08:38:59] iteration 271/ 500 | consumed samples: 2168 | elapsed time per iteration (ms): 520205.1 | throughput per GPU (TFLOP/s/GPU): 100.0 | learning rate: 2.336029E-06 | global batch size: 8 | lm loss: 8.528996E-01 | loss scale: 1.0 | grad norm: 0.622 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-30 08:50:53] iteration 272/ 500 | consumed samples: 2176 | elapsed time per iteration (ms): 713939.2 | throughput per GPU (TFLOP/s/GPU): 72.9 | learning rate: 2.320225E-06 | global batch size: 8 | lm loss: 7.788677E-01 | loss scale: 1.0 | grad norm: 0.693 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-30 08:58:48] iteration 273/ 500 | consumed samples: 2184 | elapsed time per iteration (ms): 475730.7 | throughput per GPU (TFLOP/s/GPU): 109.4 | learning rate: 2.304430E-06 | global batch size: 8 | lm loss: 8.115300E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1031307, 273919] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [143094, 1032155] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1010911, 127257] processed_samples 11500 unjoint_samples 11500 joint_samples 35 [178384, 1046612] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1045028, 395499] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [143094, 1032155] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1034746, 745511] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1031307, 273919] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1046638, 666786] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [400243, 1042734] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1010911, 127257] processed_samples 11500 unjoint_samples 11500 joint_samples 35 [178384, 1046612] [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1045028, 395499] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [400243, 1042734] processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1046638, 666786] [h264 @ 0x55d5141f5f40] mmco: unref short failure processed_samples 11500 unjoint_samples 11500 joint_samples 34 [1034746, 745511] [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [2024-11-30 09:08:50] iteration 274/ 500 | consumed samples: 2192 | elapsed time per iteration (ms): 601265.3 | throughput per GPU (TFLOP/s/GPU): 86.6 | learning rate: 2.288645E-06 | global batch size: 8 | lm loss: 7.966631E-01 | loss scale: 1.0 | grad norm: 0.664 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 11600 unjoint_samples 11600 joint_samples 34 [464788, 1032155] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1031307, 748776] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [464788, 1032155] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1010911, 433321] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1031307, 748776] [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1045028, 868530] processed_samples 11600 unjoint_samples 11600 joint_samples 35 [84381, 1028799] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1046638, 920461] processed_samples 11600 unjoint_samples 11600 joint_samples 35 [452627, 1046612] processed_samples 11600 unjoint_samples 11600 joint_samples 35 [84381, 1028799] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1010911, 433321] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [717191, 1042734] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1045028, 868530] processed_samples 11600 unjoint_samples 11600 joint_samples 35 [452627, 1046612] processed_samples 11600 unjoint_samples 11600 joint_samples 34 [717191, 1042734] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 11600 unjoint_samples 11600 joint_samples 34 [1046638, 920461] [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [2024-11-30 09:16:09] iteration 275/ 500 | consumed samples: 2200 | elapsed time per iteration (ms): 439726.4 | throughput per GPU (TFLOP/s/GPU): 118.3 | learning rate: 2.272871E-06 | global batch size: 8 | lm loss: 8.427922E-01 | loss scale: 1.0 | grad norm: 0.629 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 09:24:45] iteration 276/ 500 | consumed samples: 2208 | elapsed time per iteration (ms): 515239.6 | throughput per GPU (TFLOP/s/GPU): 101.0 | learning rate: 2.257109E-06 | global batch size: 8 | lm loss: 8.416483E-01 | loss scale: 1.0 | grad norm: 0.634 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215d5b6680] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [2024-11-30 09:36:07] iteration 277/ 500 | consumed samples: 2216 | elapsed time per iteration (ms): 682197.4 | throughput per GPU (TFLOP/s/GPU): 76.3 | learning rate: 2.241359E-06 | global batch size: 8 | lm loss: 7.881758E-01 | loss scale: 1.0 | grad norm: 0.592 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1043707, 1043534] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [284268, 1010837] processed_samples 11700 unjoint_samples 11700 joint_samples 34 [762107, 1032155] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1046638, 200875] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1043849, 56536] processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1010911, 863024] [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1010911, 863024] processed_samples 11700 unjoint_samples 11700 joint_samples 34 [762107, 1032155] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1043849, 56536] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [284268, 1010837] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [324394, 1028799] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [1046638, 200875] processed_samples 11700 unjoint_samples 11700 joint_samples 34 [1043707, 1043534] [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure processed_samples 11700 unjoint_samples 11700 joint_samples 35 [757912, 1046612] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [324394, 1028799] processed_samples 11700 unjoint_samples 11700 joint_samples 35 [757912, 1046612] [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-11-30 09:46:41] iteration 278/ 500 | consumed samples: 2224 | elapsed time per iteration (ms): 634027.2 | throughput per GPU (TFLOP/s/GPU): 82.1 | learning rate: 2.225622E-06 | global batch size: 8 | lm loss: 8.349802E-01 | loss scale: 1.0 | grad norm: 0.896 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516b0c1c0] mmco: unref short failure [h264 @ 0x55d516b0c1c0] mmco: unref short failure [2024-11-30 09:55:01] iteration 279/ 500 | consumed samples: 2232 | elapsed time per iteration (ms): 500105.8 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 2.209899E-06 | global batch size: 8 | lm loss: 7.836739E-01 | loss scale: 1.0 | grad norm: 0.856 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 10:05:10] iteration 280/ 500 | consumed samples: 2240 | elapsed time per iteration (ms): 609166.8 | throughput per GPU (TFLOP/s/GPU): 85.4 | learning rate: 2.194190E-06 | global batch size: 8 | lm loss: 8.044785E-01 | loss scale: 1.0 | grad norm: 1.376 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (171105.91, 171106.25) [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215c030640] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [2024-11-30 10:16:11] iteration 281/ 500 | consumed samples: 2248 | elapsed time per iteration (ms): 489721.5 | throughput per GPU (TFLOP/s/GPU): 106.3 | learning rate: 2.178496E-06 | global batch size: 8 | lm loss: 8.157408E-01 | loss scale: 1.0 | grad norm: 0.658 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure processed_samples 11800 unjoint_samples 11800 joint_samples 35 [52957, 1044877] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043849, 366158] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 11800 unjoint_samples 11800 joint_samples 35 [581835, 1028799] [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1047582, 226100] processed_samples 11800 unjoint_samples 11800 joint_samples 36 [1035315, 165408] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043425, 136197] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [650628, 1010837] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1046638, 508637] [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 11800 unjoint_samples 11800 joint_samples 35 [52957, 1044877] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043849, 366158] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1046638, 508637] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [581835, 1028799] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 11800 unjoint_samples 11800 joint_samples 36 [1035315, 165408] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1047582, 226100] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [1043425, 136197] processed_samples 11800 unjoint_samples 11800 joint_samples 35 [650628, 1010837] [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 11900 unjoint_samples 11900 joint_samples 35 [353851, 1044877] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043849, 681942] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [859390, 1028799] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1047582, 516773] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [960099, 1010837] processed_samples 11900 unjoint_samples 11900 joint_samples 36 [1035315, 604488] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043425, 503852] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1046638, 860170] [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 11900 unjoint_samples 11900 joint_samples 35 [353851, 1044877] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043425, 503852] processed_samples 11900 unjoint_samples 11900 joint_samples 36 [1035315, 604488] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1047582, 516773] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1043849, 681942] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [859390, 1028799] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [960099, 1010837] processed_samples 11900 unjoint_samples 11900 joint_samples 35 [1046638, 860170] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [2024-11-30 10:27:34] iteration 282/ 500 | consumed samples: 2256 | elapsed time per iteration (ms): 683305.7 | throughput per GPU (TFLOP/s/GPU): 76.2 | learning rate: 2.162817E-06 | global batch size: 8 | lm loss: 7.566515E-01 | loss scale: 1.0 | grad norm: 0.545 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x55d513632740] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [2024-11-30 10:35:49] iteration 283/ 500 | consumed samples: 2264 | elapsed time per iteration (ms): 495043.0 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 2.147155E-06 | global batch size: 8 | lm loss: 8.410187E-01 | loss scale: 1.0 | grad norm: 0.790 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [2024-11-30 10:46:45] iteration 284/ 500 | consumed samples: 2272 | elapsed time per iteration (ms): 655690.9 | throughput per GPU (TFLOP/s/GPU): 79.4 | learning rate: 2.131510E-06 | global batch size: 8 | lm loss: 7.616670E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure processed_samples 12000 unjoint_samples 12000 joint_samples 36 [242872, 1046135] processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1003444, 360813] [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 12000 unjoint_samples 12000 joint_samples 36 [236351, 1037505] processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043849, 983242] processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1047582, 865435] processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1035315, 978031] processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043425, 801378] [h264 @ 0x55d51713dd00] mmco: unref short failure processed_samples 12000 unjoint_samples 12000 joint_samples 35 [796309, 1044877] processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1003444, 360813] processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043425, 801378] processed_samples 12000 unjoint_samples 12000 joint_samples 36 [236351, 1037505] processed_samples 12000 unjoint_samples 12000 joint_samples 36 [242872, 1046135] processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1043849, 983242] processed_samples 12000 unjoint_samples 12000 joint_samples 35 [796309, 1044877] [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 12000 unjoint_samples 12000 joint_samples 35 [1047582, 865435] [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure processed_samples 12000 unjoint_samples 12000 joint_samples 36 [1035315, 978031] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [2024-11-30 10:57:08] iteration 285/ 500 | consumed samples: 2280 | elapsed time per iteration (ms): 623028.4 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 2.115882E-06 | global batch size: 8 | lm loss: 8.154826E-01 | loss scale: 1.0 | grad norm: 0.667 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [2024-11-30 11:06:58] iteration 286/ 500 | consumed samples: 2288 | elapsed time per iteration (ms): 589484.3 | throughput per GPU (TFLOP/s/GPU): 88.3 | learning rate: 2.100273E-06 | global batch size: 8 | lm loss: 8.646793E-01 | loss scale: 1.0 | grad norm: 0.998 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215ef24280] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [2024-11-30 11:15:14] iteration 287/ 500 | consumed samples: 2296 | elapsed time per iteration (ms): 496441.8 | throughput per GPU (TFLOP/s/GPU): 104.8 | learning rate: 2.084682E-06 | global batch size: 8 | lm loss: 7.899673E-01 | loss scale: 1.0 | grad norm: 0.709 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 11:23:49] iteration 288/ 500 | consumed samples: 2304 | elapsed time per iteration (ms): 515215.7 | throughput per GPU (TFLOP/s/GPU): 101.0 | learning rate: 2.069111E-06 | global batch size: 8 | lm loss: 8.179791E-01 | loss scale: 1.0 | grad norm: 0.600 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 11:33:46] iteration 289/ 500 | consumed samples: 2312 | elapsed time per iteration (ms): 597082.0 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 2.053560E-06 | global batch size: 8 | lm loss: 8.552551E-01 | loss scale: 1.0 | grad norm: 0.682 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure processed_samples 12100 unjoint_samples 12100 joint_samples 36 [993547, 135709] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [145136, 1046501] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1048014, 119398] processed_samples 12100 unjoint_samples 12100 joint_samples 37 [196550, 1047132] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1043849, 351598] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [532513, 1037505] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [613456, 1046135] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1003444, 680620] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure processed_samples 12100 unjoint_samples 12100 joint_samples 36 [145136, 1046501] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [993547, 135709] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1048014, 119398] processed_samples 12100 unjoint_samples 12100 joint_samples 37 [196550, 1047132] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1043849, 351598] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [532513, 1037505] processed_samples 12100 unjoint_samples 12100 joint_samples 36 [1003444, 680620] [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure processed_samples 12100 unjoint_samples 12100 joint_samples 36 [613456, 1046135] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x55d516ad1940] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure processed_samples 12200 unjoint_samples 12200 joint_samples 36 [993547, 473142] processed_samples 12200 unjoint_samples 12200 joint_samples 37 [504885, 1047132] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1048014, 333278] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [804067, 1037505] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [433142, 1046501] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1003444, 914513] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1043849, 693272] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [931466, 1046135] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 12200 unjoint_samples 12200 joint_samples 37 [504885, 1047132] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [993547, 473142] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1043849, 693272] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [433142, 1046501] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1048014, 333278] [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure processed_samples 12200 unjoint_samples 12200 joint_samples 36 [1003444, 914513] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [804067, 1037505] processed_samples 12200 unjoint_samples 12200 joint_samples 36 [931466, 1046135] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [2024-11-30 11:46:13] iteration 290/ 500 | consumed samples: 2320 | elapsed time per iteration (ms): 746951.5 | throughput per GPU (TFLOP/s/GPU): 69.7 | learning rate: 2.038029E-06 | global batch size: 8 | lm loss: 8.304491E-01 | loss scale: 1.0 | grad norm: 0.762 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 11:54:12] iteration 291/ 500 | consumed samples: 2328 | elapsed time per iteration (ms): 478261.6 | throughput per GPU (TFLOP/s/GPU): 108.8 | learning rate: 2.022521E-06 | global batch size: 8 | lm loss: 7.944888E-01 | loss scale: 1.0 | grad norm: 0.555 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 12:03:11] iteration 292/ 500 | consumed samples: 2336 | elapsed time per iteration (ms): 539350.2 | throughput per GPU (TFLOP/s/GPU): 96.5 | learning rate: 2.007034E-06 | global batch size: 8 | lm loss: 7.973379E-01 | loss scale: 1.0 | grad norm: 1.126 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1043849, 990824] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [251464, 1046430] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [688565, 1046501] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1048014, 615495] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046797, 218884] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [993547, 791688] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [251464, 1046430] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1043849, 990824] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046831, 299400] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [993547, 791688] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [688565, 1046501] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046797, 218884] processed_samples 12300 unjoint_samples 12300 joint_samples 36 [1048014, 615495] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [1046831, 299400] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [983928, 1047132] processed_samples 12300 unjoint_samples 12300 joint_samples 37 [983928, 1047132] [2024-11-30 12:12:44] iteration 293/ 500 | consumed samples: 2344 | elapsed time per iteration (ms): 573404.3 | throughput per GPU (TFLOP/s/GPU): 90.8 | learning rate: 1.991570E-06 | global batch size: 8 | lm loss: 8.226759E-01 | loss scale: 1.0 | grad norm: 0.719 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 12:23:57] iteration 294/ 500 | consumed samples: 2352 | elapsed time per iteration (ms): 672631.5 | throughput per GPU (TFLOP/s/GPU): 77.4 | learning rate: 1.976130E-06 | global batch size: 8 | lm loss: 8.111385E-01 | loss scale: 1.0 | grad norm: 0.669 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [2024-11-30 12:33:13] iteration 295/ 500 | consumed samples: 2360 | elapsed time per iteration (ms): 555682.2 | throughput per GPU (TFLOP/s/GPU): 93.7 | learning rate: 1.960714E-06 | global batch size: 8 | lm loss: 8.239835E-01 | loss scale: 1.0 | grad norm: 0.590 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 12400 unjoint_samples 12400 joint_samples 37 [354583, 1031883] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [106069, 1043662] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [565622, 1046430] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046797, 502036] processed_samples 12400 unjoint_samples 12400 joint_samples 38 [1008915, 359357] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046831, 588798] processed_samples 12400 unjoint_samples 12400 joint_samples 36 [1026683, 1026981] processed_samples 12400 unjoint_samples 12400 joint_samples 36 [939259, 1046501] [h264 @ 0x55d517608ac0] mmco: unref short failure processed_samples 12400 unjoint_samples 12400 joint_samples 37 [354583, 1031883] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [106069, 1043662] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [565622, 1046430] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046797, 502036] processed_samples 12400 unjoint_samples 12400 joint_samples 37 [1046831, 588798] processed_samples 12400 unjoint_samples 12400 joint_samples 38 [1008915, 359357] processed_samples 12400 unjoint_samples 12400 joint_samples 36 [1026683, 1026981] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure processed_samples 12400 unjoint_samples 12400 joint_samples 36 [939259, 1046501] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [2024-11-30 12:41:33] iteration 296/ 500 | consumed samples: 2368 | elapsed time per iteration (ms): 499937.2 | throughput per GPU (TFLOP/s/GPU): 104.1 | learning rate: 1.945322E-06 | global batch size: 8 | lm loss: 8.253292E-01 | loss scale: 1.0 | grad norm: 0.627 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [2024-11-30 12:54:29] iteration 297/ 500 | consumed samples: 2376 | elapsed time per iteration (ms): 776033.1 | throughput per GPU (TFLOP/s/GPU): 67.1 | learning rate: 1.929956E-06 | global batch size: 8 | lm loss: 8.470016E-01 | loss scale: 1.0 | grad norm: 0.718 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1040955, 213455] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [392874, 1043662] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [706780, 1031883] [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046526, 269576] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [943416, 1046430] [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046797, 834976] processed_samples 12500 unjoint_samples 12500 joint_samples 38 [1008915, 805986] [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure processed_samples 12500 unjoint_samples 12500 joint_samples 38 [24850, 1043320] [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046526, 269576] processed_samples 12500 unjoint_samples 12500 joint_samples 38 [1008915, 805986] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1040955, 213455] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [706780, 1031883] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [392874, 1043662] processed_samples 12500 unjoint_samples 12500 joint_samples 37 [1046797, 834976] [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure processed_samples 12500 unjoint_samples 12500 joint_samples 37 [943416, 1046430] processed_samples 12500 unjoint_samples 12500 joint_samples 38 [24850, 1043320] [2024-11-30 13:02:29] iteration 298/ 500 | consumed samples: 2384 | elapsed time per iteration (ms): 480085.5 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 1.914616E-06 | global batch size: 8 | lm loss: 7.771384E-01 | loss scale: 1.0 | grad norm: 0.622 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [2024-11-30 13:12:05] iteration 299/ 500 | consumed samples: 2392 | elapsed time per iteration (ms): 575897.8 | throughput per GPU (TFLOP/s/GPU): 90.4 | learning rate: 1.899303E-06 | global batch size: 8 | lm loss: 8.349781E-01 | loss scale: 1.0 | grad norm: 0.591 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x56215c3e4200] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [2024-11-30 13:19:46] iteration 300/ 500 | consumed samples: 2400 | elapsed time per iteration (ms): 461560.0 | throughput per GPU (TFLOP/s/GPU): 112.7 | learning rate: 1.884016E-06 | global batch size: 8 | lm loss: 8.001071E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (282566.86, 282567.81) [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-11-30 13:33:37] iteration 301/ 500 | consumed samples: 2408 | elapsed time per iteration (ms): 547954.3 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 1.868758E-06 | global batch size: 8 | lm loss: 8.713289E-01 | loss scale: 1.0 | grad norm: 0.673 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 12600 unjoint_samples 12600 joint_samples 39 [17506, 1046889] processed_samples 12600 unjoint_samples 12600 joint_samples 39 [17506, 1046889] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1028954, 1031883] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1028954, 1031883] processed_samples 12600 unjoint_samples 12600 joint_samples 38 [141516, 1046478] processed_samples 12600 unjoint_samples 12600 joint_samples 38 [141516, 1046478] processed_samples 12600 unjoint_samples 12600 joint_samples 38 [1035194, 366712] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1040955, 464830] processed_samples 12600 unjoint_samples 12600 joint_samples 38 [1035194, 366712] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1040955, 464830] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1046526, 539818] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [1046526, 539818] processed_samples 12600 unjoint_samples 12600 joint_samples 38 [280121, 1043320] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [641125, 1043662] processed_samples 12600 unjoint_samples 12600 joint_samples 38 [280121, 1043320] processed_samples 12600 unjoint_samples 12600 joint_samples 37 [641125, 1043662] [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-11-30 13:46:17] iteration 302/ 500 | consumed samples: 2416 | elapsed time per iteration (ms): 760100.8 | throughput per GPU (TFLOP/s/GPU): 68.5 | learning rate: 1.853528E-06 | global batch size: 8 | lm loss: 8.170073E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1045818, 284715] processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1035194, 660677] [h264 @ 0x56215ba5b0c0] mmco: unref short failure processed_samples 12700 unjoint_samples 12700 joint_samples 39 [293695, 1046889] processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1046526, 893046] processed_samples 12700 unjoint_samples 12700 joint_samples 38 [494410, 1043320] processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1040955, 855781] processed_samples 12700 unjoint_samples 12700 joint_samples 38 [562324, 1046478] processed_samples 12700 unjoint_samples 12700 joint_samples 37 [949756, 1043662] [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1045818, 284715] [h264 @ 0x55d5145449c0] mmco: unref short failure processed_samples 12700 unjoint_samples 12700 joint_samples 39 [293695, 1046889] processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1046526, 893046] processed_samples 12700 unjoint_samples 12700 joint_samples 37 [1040955, 855781] processed_samples 12700 unjoint_samples 12700 joint_samples 38 [1035194, 660677] processed_samples 12700 unjoint_samples 12700 joint_samples 38 [494410, 1043320] processed_samples 12700 unjoint_samples 12700 joint_samples 38 [562324, 1046478] [h264 @ 0x55d517c83800] mmco: unref short failure processed_samples 12700 unjoint_samples 12700 joint_samples 37 [949756, 1043662] [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [2024-11-30 13:56:06] iteration 303/ 500 | consumed samples: 2424 | elapsed time per iteration (ms): 588726.4 | throughput per GPU (TFLOP/s/GPU): 88.4 | learning rate: 1.838328E-06 | global batch size: 8 | lm loss: 8.639438E-01 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [2024-11-30 14:05:05] iteration 304/ 500 | consumed samples: 2432 | elapsed time per iteration (ms): 538865.4 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 1.823157E-06 | global batch size: 8 | lm loss: 8.561846E-01 | loss scale: 1.0 | grad norm: 0.625 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [2024-11-30 14:13:55] iteration 305/ 500 | consumed samples: 2440 | elapsed time per iteration (ms): 530809.8 | throughput per GPU (TFLOP/s/GPU): 98.0 | learning rate: 1.808017E-06 | global batch size: 8 | lm loss: 7.760535E-01 | loss scale: 1.0 | grad norm: 0.571 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1046526, 139839] [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1045818, 554980] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [857097, 1046478] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [219037, 1045808] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1043268, 112077] [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 39 [560760, 1046889] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1035194, 898744] [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 38 [778423, 1043320] [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1046526, 139839] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [857097, 1046478] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1045818, 554980] [h264 @ 0x56215bb50880] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 38 [219037, 1045808] [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1043268, 112077] processed_samples 12800 unjoint_samples 12800 joint_samples 39 [560760, 1046889] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [778423, 1043320] processed_samples 12800 unjoint_samples 12800 joint_samples 38 [1035194, 898744] [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x55d517c83800] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [2024-11-30 14:23:15] iteration 306/ 500 | consumed samples: 2448 | elapsed time per iteration (ms): 559066.7 | throughput per GPU (TFLOP/s/GPU): 93.1 | learning rate: 1.792908E-06 | global batch size: 8 | lm loss: 7.674619E-01 | loss scale: 1.0 | grad norm: 0.564 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-11-30 14:31:20] iteration 307/ 500 | consumed samples: 2456 | elapsed time per iteration (ms): 485233.6 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 1.777831E-06 | global batch size: 8 | lm loss: 8.237488E-01 | loss scale: 1.0 | grad norm: 0.547 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046397, 85695] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1046526, 528940] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [467957, 1045808] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1043268, 511206] processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046275, 62598] processed_samples 12900 unjoint_samples 12900 joint_samples 39 [838773, 1046889] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1045818, 1002455] processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046397, 85695] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [467957, 1045808] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1046526, 528940] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1043268, 511206] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1021311, 1043320] processed_samples 12900 unjoint_samples 12900 joint_samples 39 [1046275, 62598] processed_samples 12900 unjoint_samples 12900 joint_samples 39 [838773, 1046889] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1045818, 1002455] processed_samples 12900 unjoint_samples 12900 joint_samples 38 [1021311, 1043320] [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [2024-11-30 14:40:57] iteration 308/ 500 | consumed samples: 2464 | elapsed time per iteration (ms): 577346.8 | throughput per GPU (TFLOP/s/GPU): 90.1 | learning rate: 1.762786E-06 | global batch size: 8 | lm loss: 7.823042E-01 | loss scale: 1.0 | grad norm: 0.562 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x55d513738040] mmco: unref short failure [2024-11-30 14:50:22] iteration 309/ 500 | consumed samples: 2472 | elapsed time per iteration (ms): 564389.6 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 1.747775E-06 | global batch size: 8 | lm loss: 8.073215E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513738040] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-11-30 15:01:02] iteration 310/ 500 | consumed samples: 2480 | elapsed time per iteration (ms): 640592.5 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 1.732797E-06 | global batch size: 8 | lm loss: 7.878639E-01 | loss scale: 1.0 | grad norm: 0.630 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x55d512f0e280] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d2fbb40] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046275, 369384] processed_samples 13000 unjoint_samples 13000 joint_samples 38 [956646, 1045808] processed_samples 13000 unjoint_samples 13000 joint_samples 39 [240077, 1047349] processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046397, 334750] processed_samples 13000 unjoint_samples 13000 joint_samples 40 [1045623, 91370] processed_samples 13000 unjoint_samples 13000 joint_samples 39 [223695, 1044019] processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1046526, 814297] processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1043268, 872584] [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046275, 369384] processed_samples 13000 unjoint_samples 13000 joint_samples 40 [1045623, 91370] processed_samples 13000 unjoint_samples 13000 joint_samples 39 [240077, 1047349] processed_samples 13000 unjoint_samples 13000 joint_samples 39 [1046397, 334750] processed_samples 13000 unjoint_samples 13000 joint_samples 39 [223695, 1044019] processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1046526, 814297] processed_samples 13000 unjoint_samples 13000 joint_samples 38 [956646, 1045808] [h264 @ 0x55d516907ac0] mmco: unref short failure processed_samples 13000 unjoint_samples 13000 joint_samples 38 [1043268, 872584] [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215d2fbb40] mmco: unref short failure [h264 @ 0x56215d2fbb40] mmco: unref short failure [h264 @ 0x56215c1c3180] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [2024-11-30 15:10:17] iteration 311/ 500 | consumed samples: 2488 | elapsed time per iteration (ms): 554512.4 | throughput per GPU (TFLOP/s/GPU): 93.8 | learning rate: 1.717853E-06 | global batch size: 8 | lm loss: 7.991486E-01 | loss scale: 1.0 | grad norm: 0.605 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [2024-11-30 15:19:24] iteration 312/ 500 | consumed samples: 2496 | elapsed time per iteration (ms): 546881.2 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 1.702944E-06 | global batch size: 8 | lm loss: 8.143560E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [2024-11-30 15:28:58] iteration 313/ 500 | consumed samples: 2504 | elapsed time per iteration (ms): 574661.5 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 1.688070E-06 | global batch size: 8 | lm loss: 8.001103E-01 | loss scale: 1.0 | grad norm: 0.603 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 13100 unjoint_samples 13100 joint_samples 39 [103466, 1024651] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046275, 709019] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [93796, 1029821] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [571757, 1047349] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [103466, 1024651] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [609128, 1044019] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046397, 678441] processed_samples 13100 unjoint_samples 13100 joint_samples 40 [1045623, 394135] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046417, 212551] processed_samples 13100 unjoint_samples 13100 joint_samples 40 [1045623, 394135] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [93796, 1029821] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [571757, 1047349] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046397, 678441] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046275, 709019] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [609128, 1044019] processed_samples 13100 unjoint_samples 13100 joint_samples 39 [1046417, 212551] [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56216223ed40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d514dfd500] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c1c3180] mmco: unref short failure [h264 @ 0x56215c1c3180] mmco: unref short failure [h264 @ 0x56215c1c3180] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 13200 unjoint_samples 13200 joint_samples 39 [410289, 1024651] [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 13200 unjoint_samples 13200 joint_samples 39 [453053, 1029821] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [909462, 1044019] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [825074, 1047349] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046417, 469797] processed_samples 13200 unjoint_samples 13200 joint_samples 40 [1045623, 665843] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046275, 1020331] [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046397, 931380] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [410289, 1024651] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046397, 931380] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [909462, 1044019] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [453053, 1029821] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046417, 469797] processed_samples 13200 unjoint_samples 13200 joint_samples 40 [1045623, 665843] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [825074, 1047349] processed_samples 13200 unjoint_samples 13200 joint_samples 39 [1046275, 1020331] [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-11-30 15:40:10] iteration 314/ 500 | consumed samples: 2512 | elapsed time per iteration (ms): 671337.2 | throughput per GPU (TFLOP/s/GPU): 77.5 | learning rate: 1.673233E-06 | global batch size: 8 | lm loss: 8.582259E-01 | loss scale: 1.0 | grad norm: 0.582 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [2024-11-30 15:49:06] iteration 315/ 500 | consumed samples: 2520 | elapsed time per iteration (ms): 536085.1 | throughput per GPU (TFLOP/s/GPU): 97.1 | learning rate: 1.658433E-06 | global batch size: 8 | lm loss: 8.089479E-01 | loss scale: 1.0 | grad norm: 0.701 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [2024-11-30 15:58:40] iteration 316/ 500 | consumed samples: 2528 | elapsed time per iteration (ms): 574309.7 | throughput per GPU (TFLOP/s/GPU): 90.6 | learning rate: 1.643670E-06 | global batch size: 8 | lm loss: 8.455462E-01 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [2024-11-30 16:08:19] iteration 317/ 500 | consumed samples: 2536 | elapsed time per iteration (ms): 578986.6 | throughput per GPU (TFLOP/s/GPU): 89.9 | learning rate: 1.628945E-06 | global batch size: 8 | lm loss: 8.240018E-01 | loss scale: 1.0 | grad norm: 0.727 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [2024-11-30 16:23:11] iteration 318/ 500 | consumed samples: 2544 | elapsed time per iteration (ms): 891890.1 | throughput per GPU (TFLOP/s/GPU): 58.3 | learning rate: 1.614259E-06 | global batch size: 8 | lm loss: 8.104389E-01 | loss scale: 1.0 | grad norm: 0.606 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1028881, 517520] processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046397, 230256] processed_samples 13300 unjoint_samples 13300 joint_samples 40 [63461, 1047349] processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046275, 280185] [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure processed_samples 13300 unjoint_samples 13300 joint_samples 41 [399929, 835418] processed_samples 13300 unjoint_samples 13300 joint_samples 39 [1046417, 860811] processed_samples 13300 unjoint_samples 13300 joint_samples 39 [744636, 1029821] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure processed_samples 13300 unjoint_samples 13300 joint_samples 39 [794114, 1024651] [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046397, 230256] processed_samples 13300 unjoint_samples 13300 joint_samples 40 [63461, 1047349] processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1046275, 280185] processed_samples 13300 unjoint_samples 13300 joint_samples 40 [1028881, 517520] [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure processed_samples 13300 unjoint_samples 13300 joint_samples 41 [399929, 835418] [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 13300 unjoint_samples 13300 joint_samples 39 [1046417, 860811] processed_samples 13300 unjoint_samples 13300 joint_samples 39 [744636, 1029821] processed_samples 13300 unjoint_samples 13300 joint_samples 39 [794114, 1024651] [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [2024-11-30 16:32:13] iteration 319/ 500 | consumed samples: 2552 | elapsed time per iteration (ms): 542489.5 | throughput per GPU (TFLOP/s/GPU): 95.9 | learning rate: 1.599612E-06 | global batch size: 8 | lm loss: 8.013833E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d516fe4400] Missing reference picture, default is 65530 [h264 @ 0x55d516fe4400] Missing reference picture, default is 65530 [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b35e3c0] Missing reference picture, default is 65530 [h264 @ 0x56215b35e3c0] Missing reference picture, default is 65530 [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 13400 unjoint_samples 13400 joint_samples 40 [275376, 1047349] [h264 @ 0x55d517608ac0] mmco: unref short failure processed_samples 13400 unjoint_samples 13400 joint_samples 40 [96442, 1045599] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1028881, 803492] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046275, 632442] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1044023, 7939] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046397, 549541] processed_samples 13400 unjoint_samples 13400 joint_samples 41 [673681, 835418] [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046469, 2275] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [275376, 1047349] [h264 @ 0x56215b706800] mmco: unref short failure processed_samples 13400 unjoint_samples 13400 joint_samples 40 [96442, 1045599] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046275, 632442] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1044023, 7939] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046469, 2275] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1028881, 803492] processed_samples 13400 unjoint_samples 13400 joint_samples 40 [1046397, 549541] processed_samples 13400 unjoint_samples 13400 joint_samples 41 [673681, 835418] [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [2024-11-30 16:42:59] iteration 320/ 500 | consumed samples: 2560 | elapsed time per iteration (ms): 645208.1 | throughput per GPU (TFLOP/s/GPU): 80.7 | learning rate: 1.585004E-06 | global batch size: 8 | lm loss: 8.406156E-01 | loss scale: 1.0 | grad norm: 0.720 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (157375.32, 157375.75) [2024-11-30 16:55:10] iteration 321/ 500 | consumed samples: 2568 | elapsed time per iteration (ms): 573666.6 | throughput per GPU (TFLOP/s/GPU): 90.7 | learning rate: 1.570438E-06 | global batch size: 8 | lm loss: 8.569485E-01 | loss scale: 1.0 | grad norm: 0.749 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure processed_samples 13500 unjoint_samples 13500 joint_samples 40 [789281, 1047349] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046469, 307868] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046469, 307868] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046275, 850373] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1044023, 290292] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [789281, 1047349] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1044023, 290292] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [344892, 1045599] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046275, 850373] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [344892, 1045599] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1031113, 1032192] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046397, 1039659] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1031113, 1032192] processed_samples 13500 unjoint_samples 13500 joint_samples 41 [886594, 888903] processed_samples 13500 unjoint_samples 13500 joint_samples 40 [1046397, 1039659] processed_samples 13500 unjoint_samples 13500 joint_samples 41 [886594, 888903] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c97ef40] mmco: unref short failure [h264 @ 0x56215c97ef40] mmco: unref short failure [2024-11-30 17:04:49] iteration 322/ 500 | consumed samples: 2576 | elapsed time per iteration (ms): 579528.4 | throughput per GPU (TFLOP/s/GPU): 89.8 | learning rate: 1.555912E-06 | global batch size: 8 | lm loss: 8.402985E-01 | loss scale: 1.0 | grad norm: 0.602 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [2024-11-30 17:13:00] iteration 323/ 500 | consumed samples: 2584 | elapsed time per iteration (ms): 491025.6 | throughput per GPU (TFLOP/s/GPU): 106.0 | learning rate: 1.541428E-06 | global batch size: 8 | lm loss: 8.409790E-01 | loss scale: 1.0 | grad norm: 0.678 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [2024-11-30 17:24:00] iteration 324/ 500 | consumed samples: 2592 | elapsed time per iteration (ms): 659534.5 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 1.526987E-06 | global batch size: 8 | lm loss: 8.275973E-01 | loss scale: 1.0 | grad norm: 0.780 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [2024-11-30 17:33:31] iteration 325/ 500 | consumed samples: 2600 | elapsed time per iteration (ms): 571165.1 | throughput per GPU (TFLOP/s/GPU): 91.1 | learning rate: 1.512588E-06 | global batch size: 8 | lm loss: 8.160167E-01 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-11-30 17:45:33] iteration 326/ 500 | consumed samples: 2608 | elapsed time per iteration (ms): 722290.8 | throughput per GPU (TFLOP/s/GPU): 72.0 | learning rate: 1.498233E-06 | global batch size: 8 | lm loss: 8.057153E-01 | loss scale: 1.0 | grad norm: 0.536 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 13600 unjoint_samples 13600 joint_samples 40 [625978, 1045599] processed_samples 13600 unjoint_samples 13600 joint_samples 40 [625978, 1045599] processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1046469, 616229] processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1046469, 616229] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [196349, 1038113] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [196349, 1038113] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1045341, 106633] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1045341, 106633] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [199051, 1045463] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [199051, 1045463] processed_samples 13600 unjoint_samples 13600 joint_samples 42 [1046810, 9387] processed_samples 13600 unjoint_samples 13600 joint_samples 42 [1046810, 9387] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1046620, 355512] processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1044023, 603098] processed_samples 13600 unjoint_samples 13600 joint_samples 41 [1046620, 355512] processed_samples 13600 unjoint_samples 13600 joint_samples 40 [1044023, 603098] [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56216223ed40] mmco: unref short failure [h264 @ 0x56216223ed40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1046620, 898755] processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1045341, 466511] processed_samples 13700 unjoint_samples 13700 joint_samples 41 [435373, 1038113] [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure processed_samples 13700 unjoint_samples 13700 joint_samples 42 [1046810, 283747] processed_samples 13700 unjoint_samples 13700 joint_samples 41 [486705, 1045463] processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1046469, 851558] processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1044023, 870758] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure processed_samples 13700 unjoint_samples 13700 joint_samples 40 [910623, 1045599] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1046620, 898755] processed_samples 13700 unjoint_samples 13700 joint_samples 41 [1045341, 466511] processed_samples 13700 unjoint_samples 13700 joint_samples 41 [435373, 1038113] [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1046469, 851558] [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure processed_samples 13700 unjoint_samples 13700 joint_samples 42 [1046810, 283747] processed_samples 13700 unjoint_samples 13700 joint_samples 41 [486705, 1045463] processed_samples 13700 unjoint_samples 13700 joint_samples 40 [1044023, 870758] processed_samples 13700 unjoint_samples 13700 joint_samples 40 [910623, 1045599] [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-11-30 17:57:19] iteration 327/ 500 | consumed samples: 2616 | elapsed time per iteration (ms): 705428.7 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 1.483922E-06 | global batch size: 8 | lm loss: 8.687828E-01 | loss scale: 1.0 | grad norm: 0.626 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [2024-11-30 18:04:37] iteration 328/ 500 | consumed samples: 2624 | elapsed time per iteration (ms): 438551.2 | throughput per GPU (TFLOP/s/GPU): 118.7 | learning rate: 1.469656E-06 | global batch size: 8 | lm loss: 7.690409E-01 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 18:14:37] iteration 329/ 500 | consumed samples: 2632 | elapsed time per iteration (ms): 599202.9 | throughput per GPU (TFLOP/s/GPU): 86.9 | learning rate: 1.455435E-06 | global batch size: 8 | lm loss: 8.852109E-01 | loss scale: 1.0 | grad norm: 0.685 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [2024-11-30 18:23:39] iteration 330/ 500 | consumed samples: 2640 | elapsed time per iteration (ms): 541993.7 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 1.441260E-06 | global batch size: 8 | lm loss: 7.726980E-01 | loss scale: 1.0 | grad norm: 0.566 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 13800 unjoint_samples 13800 joint_samples 42 [184223, 1045284] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [124485, 1046118] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1046955, 122399] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [149934, 1047710] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1045341, 735665] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [702685, 1038113] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [764513, 1045463] processed_samples 13800 unjoint_samples 13800 joint_samples 42 [1046810, 638210] [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure processed_samples 13800 unjoint_samples 13800 joint_samples 42 [184223, 1045284] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [124485, 1046118] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1046955, 122399] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [149934, 1047710] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [1045341, 735665] processed_samples 13800 unjoint_samples 13800 joint_samples 42 [1046810, 638210] processed_samples 13800 unjoint_samples 13800 joint_samples 41 [702685, 1038113] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 13800 unjoint_samples 13800 joint_samples 41 [764513, 1045463] [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [2024-11-30 18:34:42] iteration 331/ 500 | consumed samples: 2648 | elapsed time per iteration (ms): 663480.9 | throughput per GPU (TFLOP/s/GPU): 78.4 | learning rate: 1.427131E-06 | global batch size: 8 | lm loss: 7.973595E-01 | loss scale: 1.0 | grad norm: 0.591 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 13900 unjoint_samples 13900 joint_samples 42 [58896, 1047059] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1046955, 473432] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1046955, 473432] processed_samples 13900 unjoint_samples 13900 joint_samples 42 [58896, 1047059] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [332452, 1046118] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [409555, 1047710] processed_samples 13900 unjoint_samples 13900 joint_samples 42 [1046810, 900611] processed_samples 13900 unjoint_samples 13900 joint_samples 42 [619416, 1045284] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1042745, 1045463] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [963526, 1038113] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [332452, 1046118] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [409555, 1047710] processed_samples 13900 unjoint_samples 13900 joint_samples 42 [619416, 1045284] processed_samples 13900 unjoint_samples 13900 joint_samples 42 [1046810, 900611] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [963526, 1038113] processed_samples 13900 unjoint_samples 13900 joint_samples 41 [1042745, 1045463] [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [2024-11-30 18:44:43] iteration 332/ 500 | consumed samples: 2656 | elapsed time per iteration (ms): 600467.4 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 1.413050E-06 | global batch size: 8 | lm loss: 8.575032E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-11-30 18:53:03] iteration 333/ 500 | consumed samples: 2664 | elapsed time per iteration (ms): 500926.0 | throughput per GPU (TFLOP/s/GPU): 103.9 | learning rate: 1.399016E-06 | global batch size: 8 | lm loss: 8.795822E-01 | loss scale: 1.0 | grad norm: 0.679 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-30 19:03:49] iteration 334/ 500 | consumed samples: 2672 | elapsed time per iteration (ms): 645587.3 | throughput per GPU (TFLOP/s/GPU): 80.6 | learning rate: 1.385031E-06 | global batch size: 8 | lm loss: 7.920018E-01 | loss scale: 1.0 | grad norm: 0.643 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure processed_samples 14000 unjoint_samples 14000 joint_samples 42 [1005724, 285725] [h264 @ 0x55d5141f5f40] mmco: unref short failure processed_samples 14000 unjoint_samples 14000 joint_samples 42 [423876, 1047059] [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 14000 unjoint_samples 14000 joint_samples 41 [1046955, 890045] processed_samples 14000 unjoint_samples 14000 joint_samples 43 [153836, 1025384] processed_samples 14000 unjoint_samples 14000 joint_samples 41 [779490, 1047710] processed_samples 14000 unjoint_samples 14000 joint_samples 42 [248842, 1045463] processed_samples 14000 unjoint_samples 14000 joint_samples 41 [592321, 1046118] processed_samples 14000 unjoint_samples 14000 joint_samples 42 [1005724, 285725] processed_samples 14000 unjoint_samples 14000 joint_samples 42 [924084, 1045284] processed_samples 14000 unjoint_samples 14000 joint_samples 42 [423876, 1047059] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 14000 unjoint_samples 14000 joint_samples 41 [1046955, 890045] processed_samples 14000 unjoint_samples 14000 joint_samples 41 [779490, 1047710] processed_samples 14000 unjoint_samples 14000 joint_samples 43 [153836, 1025384] processed_samples 14000 unjoint_samples 14000 joint_samples 42 [248842, 1045463] processed_samples 14000 unjoint_samples 14000 joint_samples 41 [592321, 1046118] processed_samples 14000 unjoint_samples 14000 joint_samples 42 [924084, 1045284] [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d512f1c480] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [2024-11-30 19:12:18] iteration 335/ 500 | consumed samples: 2680 | elapsed time per iteration (ms): 508795.7 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.371094E-06 | global batch size: 8 | lm loss: 8.916619E-01 | loss scale: 1.0 | grad norm: 0.626 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-11-30 19:20:23] iteration 336/ 500 | consumed samples: 2688 | elapsed time per iteration (ms): 484938.8 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 1.357207E-06 | global batch size: 8 | lm loss: 8.181082E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [2024-11-30 19:30:54] iteration 337/ 500 | consumed samples: 2696 | elapsed time per iteration (ms): 631047.4 | throughput per GPU (TFLOP/s/GPU): 82.5 | learning rate: 1.343370E-06 | global batch size: 8 | lm loss: 7.870709E-01 | loss scale: 1.0 | grad norm: 0.616 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5145cc380] mmco: unref short failure processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1046955, 141152] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1046955, 141152] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [36720, 1047710] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [36720, 1047710] processed_samples 14100 unjoint_samples 14100 joint_samples 43 [183984, 1045284] processed_samples 14100 unjoint_samples 14100 joint_samples 43 [183984, 1045284] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1005724, 662471] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [1005724, 662471] [h264 @ 0x55d5139fff40] mmco: unref short failure processed_samples 14100 unjoint_samples 14100 joint_samples 42 [882020, 1047059] [h264 @ 0x56215b89c080] mmco: unref short failure processed_samples 14100 unjoint_samples 14100 joint_samples 42 [882020, 1047059] processed_samples 14100 unjoint_samples 14100 joint_samples 41 [920704, 1046118] processed_samples 14100 unjoint_samples 14100 joint_samples 43 [520287, 1025384] processed_samples 14100 unjoint_samples 14100 joint_samples 43 [520287, 1025384] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [552508, 1045463] processed_samples 14100 unjoint_samples 14100 joint_samples 42 [552508, 1045463] processed_samples 14100 unjoint_samples 14100 joint_samples 41 [920704, 1046118] [h264 @ 0x55d516fefc40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [2024-11-30 19:40:45] iteration 338/ 500 | consumed samples: 2704 | elapsed time per iteration (ms): 591547.6 | throughput per GPU (TFLOP/s/GPU): 88.0 | learning rate: 1.329584E-06 | global batch size: 8 | lm loss: 7.846169E-01 | loss scale: 1.0 | grad norm: 0.579 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-11-30 19:49:58] iteration 339/ 500 | consumed samples: 2712 | elapsed time per iteration (ms): 552124.6 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 1.315849E-06 | global batch size: 8 | lm loss: 7.516593E-01 | loss scale: 1.0 | grad norm: 0.736 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure processed_samples 14200 unjoint_samples 14200 joint_samples 42 [159121, 1046118] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [159121, 1046118] processed_samples 14200 unjoint_samples 14200 joint_samples 43 [131062, 1047059] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [434399, 1047710] processed_samples 14200 unjoint_samples 14200 joint_samples 43 [131062, 1047059] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [434399, 1047710] processed_samples 14200 unjoint_samples 14200 joint_samples 43 [453173, 1045284] processed_samples 14200 unjoint_samples 14200 joint_samples 43 [453173, 1045284] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1046955, 515088] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1046955, 515088] processed_samples 14200 unjoint_samples 14200 joint_samples 43 [919180, 1025384] processed_samples 14200 unjoint_samples 14200 joint_samples 43 [919180, 1025384] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1005724, 943910] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [1005724, 943910] [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 14200 unjoint_samples 14200 joint_samples 42 [892352, 1045463] processed_samples 14200 unjoint_samples 14200 joint_samples 42 [892352, 1045463] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215ba43980] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [2024-11-30 20:00:32] iteration 340/ 500 | consumed samples: 2720 | elapsed time per iteration (ms): 634723.9 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 1.302166E-06 | global batch size: 8 | lm loss: 7.638261E-01 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (263589.57, 263590.48) [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [2024-11-30 20:12:11] iteration 341/ 500 | consumed samples: 2728 | elapsed time per iteration (ms): 435338.3 | throughput per GPU (TFLOP/s/GPU): 119.5 | learning rate: 1.288535E-06 | global batch size: 8 | lm loss: 8.274179E-01 | loss scale: 1.0 | grad norm: 0.595 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [2024-11-30 20:22:15] iteration 342/ 500 | consumed samples: 2736 | elapsed time per iteration (ms): 603582.7 | throughput per GPU (TFLOP/s/GPU): 86.2 | learning rate: 1.274957E-06 | global batch size: 8 | lm loss: 8.112563E-01 | loss scale: 1.0 | grad norm: 0.836 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure processed_samples 14300 unjoint_samples 14300 joint_samples 44 [202728, 1042201] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [76695, 1047559] processed_samples 14300 unjoint_samples 14300 joint_samples 44 [202728, 1042201] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [1043636, 197466] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [76695, 1047559] processed_samples 14300 unjoint_samples 14300 joint_samples 42 [527394, 1046118] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [1043636, 197466] processed_samples 14300 unjoint_samples 14300 joint_samples 42 [1046955, 813807] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [389192, 1047059] processed_samples 14300 unjoint_samples 14300 joint_samples 42 [527394, 1046118] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [389192, 1047059] processed_samples 14300 unjoint_samples 14300 joint_samples 42 [839538, 1047710] processed_samples 14300 unjoint_samples 14300 joint_samples 42 [839538, 1047710] processed_samples 14300 unjoint_samples 14300 joint_samples 42 [1046955, 813807] processed_samples 14300 unjoint_samples 14300 joint_samples 43 [767251, 1045284] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 14300 unjoint_samples 14300 joint_samples 43 [767251, 1045284] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [2024-11-30 20:31:53] iteration 343/ 500 | consumed samples: 2744 | elapsed time per iteration (ms): 578439.8 | throughput per GPU (TFLOP/s/GPU): 90.0 | learning rate: 1.261432E-06 | global batch size: 8 | lm loss: 8.647650E-01 | loss scale: 1.0 | grad norm: 0.580 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [2024-11-30 20:40:44] iteration 344/ 500 | consumed samples: 2752 | elapsed time per iteration (ms): 530650.1 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 1.247961E-06 | global batch size: 8 | lm loss: 8.212827E-01 | loss scale: 1.0 | grad norm: 0.617 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [2024-11-30 20:49:18] iteration 345/ 500 | consumed samples: 2760 | elapsed time per iteration (ms): 513944.1 | throughput per GPU (TFLOP/s/GPU): 101.3 | learning rate: 1.234546E-06 | global batch size: 8 | lm loss: 8.558559E-01 | loss scale: 1.0 | grad norm: 0.776 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1047213, 59291] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [110012, 1030169] processed_samples 14400 unjoint_samples 14400 joint_samples 44 [84987, 1046776] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1047213, 59291] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1043636, 451790] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [110012, 1030169] processed_samples 14400 unjoint_samples 14400 joint_samples 44 [578833, 1042201] processed_samples 14400 unjoint_samples 14400 joint_samples 44 [84987, 1046776] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [690881, 1047059] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [349469, 1047559] processed_samples 14400 unjoint_samples 14400 joint_samples 42 [930189, 1046118] processed_samples 14400 unjoint_samples 14400 joint_samples 44 [578833, 1042201] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [1043636, 451790] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [690881, 1047059] processed_samples 14400 unjoint_samples 14400 joint_samples 43 [349469, 1047559] processed_samples 14400 unjoint_samples 14400 joint_samples 42 [930189, 1046118] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-11-30 21:00:13] iteration 346/ 500 | consumed samples: 2768 | elapsed time per iteration (ms): 655458.5 | throughput per GPU (TFLOP/s/GPU): 79.4 | learning rate: 1.221185E-06 | global batch size: 8 | lm loss: 8.408645E-01 | loss scale: 1.0 | grad norm: 0.750 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d517059440] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d517059440] mmco: unref short failure [h264 @ 0x55d517059440] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1047213, 493755] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1047213, 493755] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [156327, 1048242] processed_samples 14500 unjoint_samples 14500 joint_samples 44 [428707, 1046776] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [351813, 1030169] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [156327, 1048242] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1022921, 1047059] processed_samples 14500 unjoint_samples 14500 joint_samples 44 [428707, 1046776] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [351813, 1030169] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1022921, 1047059] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1043636, 755528] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [649191, 1047559] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [1043636, 755528] processed_samples 14500 unjoint_samples 14500 joint_samples 44 [926515, 1042201] processed_samples 14500 unjoint_samples 14500 joint_samples 44 [926515, 1042201] processed_samples 14500 unjoint_samples 14500 joint_samples 43 [649191, 1047559] [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [2024-11-30 21:09:08] iteration 347/ 500 | consumed samples: 2776 | elapsed time per iteration (ms): 534217.7 | throughput per GPU (TFLOP/s/GPU): 97.4 | learning rate: 1.207880E-06 | global batch size: 8 | lm loss: 8.221813E-01 | loss scale: 1.0 | grad norm: 0.547 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215de57ac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [2024-11-30 21:19:18] iteration 348/ 500 | consumed samples: 2784 | elapsed time per iteration (ms): 610608.1 | throughput per GPU (TFLOP/s/GPU): 85.2 | learning rate: 1.194631E-06 | global batch size: 8 | lm loss: 7.808754E-01 | loss scale: 1.0 | grad norm: 0.618 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure processed_samples 14600 unjoint_samples 14600 joint_samples 44 [1046605, 187775] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [468598, 1048242] processed_samples 14600 unjoint_samples 14600 joint_samples 45 [122465, 1046598] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1047213, 762387] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [957133, 1047559] processed_samples 14600 unjoint_samples 14600 joint_samples 44 [716648, 1046776] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [668285, 1030169] [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1043636, 1024674] [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure processed_samples 14600 unjoint_samples 14600 joint_samples 43 [468598, 1048242] processed_samples 14600 unjoint_samples 14600 joint_samples 44 [1046605, 187775] processed_samples 14600 unjoint_samples 14600 joint_samples 45 [122465, 1046598] [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1047213, 762387] [h264 @ 0x55d513a64780] mmco: unref short failure processed_samples 14600 unjoint_samples 14600 joint_samples 44 [716648, 1046776] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [668285, 1030169] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [1043636, 1024674] processed_samples 14600 unjoint_samples 14600 joint_samples 43 [957133, 1047559] [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [2024-11-30 21:28:44] iteration 349/ 500 | consumed samples: 2792 | elapsed time per iteration (ms): 565549.1 | throughput per GPU (TFLOP/s/GPU): 92.0 | learning rate: 1.181440E-06 | global batch size: 8 | lm loss: 8.365794E-01 | loss scale: 1.0 | grad norm: 0.707 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [2024-11-30 21:39:35] iteration 350/ 500 | consumed samples: 2800 | elapsed time per iteration (ms): 651391.3 | throughput per GPU (TFLOP/s/GPU): 79.9 | learning rate: 1.168305E-06 | global batch size: 8 | lm loss: 8.073776E-01 | loss scale: 1.0 | grad norm: 0.757 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [2024-11-30 21:48:46] iteration 351/ 500 | consumed samples: 2808 | elapsed time per iteration (ms): 550324.4 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 1.155229E-06 | global batch size: 8 | lm loss: 8.848139E-01 | loss scale: 1.0 | grad norm: 1.070 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [2024-11-30 21:57:55] iteration 352/ 500 | consumed samples: 2816 | elapsed time per iteration (ms): 549263.8 | throughput per GPU (TFLOP/s/GPU): 94.7 | learning rate: 1.142211E-06 | global batch size: 8 | lm loss: 8.049350E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 14700 unjoint_samples 14700 joint_samples 44 [11089, 1046570] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [11089, 1046570] processed_samples 14700 unjoint_samples 14700 joint_samples 45 [1039184, 79494] processed_samples 14700 unjoint_samples 14700 joint_samples 45 [1039184, 79494] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [335570, 1047559] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1046605, 412179] processed_samples 14700 unjoint_samples 14700 joint_samples 45 [477050, 1046598] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [335570, 1047559] processed_samples 14700 unjoint_samples 14700 joint_samples 45 [477050, 1046598] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1046605, 412179] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1047404, 344796] processed_samples 14700 unjoint_samples 14700 joint_samples 44 [1047404, 344796] processed_samples 14700 unjoint_samples 14700 joint_samples 43 [788256, 1048242] processed_samples 14700 unjoint_samples 14700 joint_samples 43 [788256, 1048242] processed_samples 14700 unjoint_samples 14700 joint_samples 43 [992955, 1030169] processed_samples 14700 unjoint_samples 14700 joint_samples 43 [992955, 1030169] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [2024-11-30 22:06:26] iteration 353/ 500 | consumed samples: 2824 | elapsed time per iteration (ms): 511600.3 | throughput per GPU (TFLOP/s/GPU): 101.7 | learning rate: 1.129252E-06 | global batch size: 8 | lm loss: 8.550798E-01 | loss scale: 1.0 | grad norm: 0.775 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [2024-11-30 22:17:02] iteration 354/ 500 | consumed samples: 2832 | elapsed time per iteration (ms): 635950.4 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 1.116353E-06 | global batch size: 8 | lm loss: 8.067714E-01 | loss scale: 1.0 | grad norm: 0.676 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d512d490c0] mmco: unref short failure [h264 @ 0x55d512d490c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d512d490c0] mmco: unref short failure [h264 @ 0x55d512d490c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215c0ec900] mmco: unref short failure [h264 @ 0x56215c0ec900] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d516ea6140] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure processed_samples 14800 unjoint_samples 14800 joint_samples 45 [723060, 1046598] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [92635, 1048242] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [239637, 1047448] processed_samples 14800 unjoint_samples 14800 joint_samples 45 [723060, 1046598] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [239637, 1047448] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [92635, 1048242] processed_samples 14800 unjoint_samples 14800 joint_samples 45 [1039184, 429369] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [633469, 1047559] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [329384, 1046570] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [329384, 1046570] processed_samples 14800 unjoint_samples 14800 joint_samples 45 [1039184, 429369] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1047404, 622276] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [633469, 1047559] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1046605, 710576] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1047404, 622276] processed_samples 14800 unjoint_samples 14800 joint_samples 44 [1046605, 710576] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-11-30 22:27:58] iteration 355/ 500 | consumed samples: 2840 | elapsed time per iteration (ms): 656031.8 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 1.103514E-06 | global batch size: 8 | lm loss: 8.307241E-01 | loss scale: 1.0 | grad norm: 0.578 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure processed_samples 14900 unjoint_samples 14900 joint_samples 44 [533642, 1047448] processed_samples 14900 unjoint_samples 14900 joint_samples 46 [46039, 1046659] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [980286, 1047559] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [610685, 1046570] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [533642, 1047448] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [339028, 1048242] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1046605, 990142] processed_samples 14900 unjoint_samples 14900 joint_samples 45 [1039184, 814639] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [980286, 1047559] processed_samples 14900 unjoint_samples 14900 joint_samples 46 [46039, 1046659] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1047404, 983297] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1047404, 983297] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [339028, 1048242] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [610685, 1046570] processed_samples 14900 unjoint_samples 14900 joint_samples 44 [1046605, 990142] processed_samples 14900 unjoint_samples 14900 joint_samples 45 [1039184, 814639] [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-11-30 22:36:51] iteration 356/ 500 | consumed samples: 2848 | elapsed time per iteration (ms): 532141.4 | throughput per GPU (TFLOP/s/GPU): 97.8 | learning rate: 1.090736E-06 | global batch size: 8 | lm loss: 8.175163E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-11-30 22:46:26] iteration 357/ 500 | consumed samples: 2856 | elapsed time per iteration (ms): 575283.9 | throughput per GPU (TFLOP/s/GPU): 90.5 | learning rate: 1.078019E-06 | global batch size: 8 | lm loss: 8.372152E-01 | loss scale: 1.0 | grad norm: 0.692 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [2024-11-30 22:58:32] iteration 358/ 500 | consumed samples: 2864 | elapsed time per iteration (ms): 725842.3 | throughput per GPU (TFLOP/s/GPU): 71.7 | learning rate: 1.065363E-06 | global batch size: 8 | lm loss: 8.198099E-01 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [2024-11-30 23:06:27] iteration 359/ 500 | consumed samples: 2872 | elapsed time per iteration (ms): 475503.6 | throughput per GPU (TFLOP/s/GPU): 109.4 | learning rate: 1.052770E-06 | global batch size: 8 | lm loss: 8.032740E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure processed_samples 15000 unjoint_samples 15000 joint_samples 46 [67811, 1044908] processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1047404, 204719] processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1046605, 194912] processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1026421, 275715] [h264 @ 0x56215cbc4280] mmco: unref short failure processed_samples 15000 unjoint_samples 15000 joint_samples 44 [696351, 1048242] processed_samples 15000 unjoint_samples 15000 joint_samples 46 [299825, 1046659] processed_samples 15000 unjoint_samples 15000 joint_samples 44 [939138, 1047448] processed_samples 15000 unjoint_samples 15000 joint_samples 44 [936282, 1046570] processed_samples 15000 unjoint_samples 15000 joint_samples 46 [299825, 1046659] processed_samples 15000 unjoint_samples 15000 joint_samples 44 [696351, 1048242] processed_samples 15000 unjoint_samples 15000 joint_samples 46 [67811, 1044908] processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1046605, 194912] processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1047404, 204719] processed_samples 15000 unjoint_samples 15000 joint_samples 45 [1026421, 275715] [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 15000 unjoint_samples 15000 joint_samples 44 [936282, 1046570] processed_samples 15000 unjoint_samples 15000 joint_samples 44 [939138, 1047448] [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [2024-11-30 23:14:58] iteration 360/ 500 | consumed samples: 2880 | elapsed time per iteration (ms): 511060.0 | throughput per GPU (TFLOP/s/GPU): 101.8 | learning rate: 1.040240E-06 | global batch size: 8 | lm loss: 8.227824E-01 | loss scale: 1.0 | grad norm: 0.751 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (257612.46, 257612.88) [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [2024-11-30 23:26:27] iteration 361/ 500 | consumed samples: 2888 | elapsed time per iteration (ms): 430482.0 | throughput per GPU (TFLOP/s/GPU): 120.9 | learning rate: 1.027773E-06 | global batch size: 8 | lm loss: 8.282033E-01 | loss scale: 1.0 | grad norm: 0.768 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure processed_samples 15100 unjoint_samples 15100 joint_samples 45 [161612, 1047448] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1042748, 177357] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1042748, 177357] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [161612, 1047448] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1046605, 525556] processed_samples 15100 unjoint_samples 15100 joint_samples 46 [429458, 1044908] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1046605, 525556] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure processed_samples 15100 unjoint_samples 15100 joint_samples 46 [429458, 1044908] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1026421, 554426] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1026421, 554426] processed_samples 15100 unjoint_samples 15100 joint_samples 46 [643541, 1046659] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1047404, 507909] processed_samples 15100 unjoint_samples 15100 joint_samples 45 [1047404, 507909] processed_samples 15100 unjoint_samples 15100 joint_samples 46 [643541, 1046659] processed_samples 15100 unjoint_samples 15100 joint_samples 44 [984374, 1048242] processed_samples 15100 unjoint_samples 15100 joint_samples 44 [984374, 1048242] [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [2024-11-30 23:35:06] iteration 362/ 500 | consumed samples: 2896 | elapsed time per iteration (ms): 519023.1 | throughput per GPU (TFLOP/s/GPU): 100.3 | learning rate: 1.015370E-06 | global batch size: 8 | lm loss: 8.181028E-01 | loss scale: 1.0 | grad norm: 0.796 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure processed_samples 15200 unjoint_samples 15200 joint_samples 45 [407349, 1047448] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1048022, 204247] processed_samples 15200 unjoint_samples 15200 joint_samples 46 [646524, 1044908] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1026421, 901311] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1042748, 568415] processed_samples 15200 unjoint_samples 15200 joint_samples 46 [911276, 1046659] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1046605, 868111] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1047404, 759171] [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure processed_samples 15200 unjoint_samples 15200 joint_samples 45 [407349, 1047448] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1048022, 204247] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1042748, 568415] processed_samples 15200 unjoint_samples 15200 joint_samples 46 [911276, 1046659] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1026421, 901311] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1046605, 868111] processed_samples 15200 unjoint_samples 15200 joint_samples 45 [1047404, 759171] processed_samples 15200 unjoint_samples 15200 joint_samples 46 [646524, 1044908] [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517d67ec0] mmco: unref short failure [h264 @ 0x55d517d67ec0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-11-30 23:47:32] iteration 363/ 500 | consumed samples: 2904 | elapsed time per iteration (ms): 745984.1 | throughput per GPU (TFLOP/s/GPU): 69.8 | learning rate: 1.003032E-06 | global batch size: 8 | lm loss: 7.587072E-01 | loss scale: 1.0 | grad norm: 0.632 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [2024-11-30 23:58:28] iteration 364/ 500 | consumed samples: 2912 | elapsed time per iteration (ms): 656084.8 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 9.907581E-07 | global batch size: 8 | lm loss: 8.105970E-01 | loss scale: 1.0 | grad norm: 0.607 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 00:08:00] iteration 365/ 500 | consumed samples: 2920 | elapsed time per iteration (ms): 571895.2 | throughput per GPU (TFLOP/s/GPU): 91.0 | learning rate: 9.785499E-07 | global batch size: 8 | lm loss: 8.919947E-01 | loss scale: 1.0 | grad norm: 0.746 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure processed_samples 15300 unjoint_samples 15300 joint_samples 46 [1046605, 90877] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [642808, 1047448] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1048022, 581832] [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 15300 unjoint_samples 15300 joint_samples 47 [103812, 1046659] [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 15300 unjoint_samples 15300 joint_samples 46 [96183, 1043822] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1047404, 1034632] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1042748, 830319] [h264 @ 0x55d5130aa740] mmco: unref short failure processed_samples 15300 unjoint_samples 15300 joint_samples 46 [1046605, 90877] processed_samples 15300 unjoint_samples 15300 joint_samples 46 [940276, 1044908] [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 15300 unjoint_samples 15300 joint_samples 45 [642808, 1047448] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1048022, 581832] processed_samples 15300 unjoint_samples 15300 joint_samples 46 [96183, 1043822] processed_samples 15300 unjoint_samples 15300 joint_samples 47 [103812, 1046659] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1042748, 830319] processed_samples 15300 unjoint_samples 15300 joint_samples 46 [940276, 1044908] processed_samples 15300 unjoint_samples 15300 joint_samples 45 [1047404, 1034632] [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [2024-12-01 00:16:26] iteration 366/ 500 | consumed samples: 2928 | elapsed time per iteration (ms): 506663.6 | throughput per GPU (TFLOP/s/GPU): 102.7 | learning rate: 9.664075E-07 | global batch size: 8 | lm loss: 8.039944E-01 | loss scale: 1.0 | grad norm: 0.731 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [2024-12-01 00:25:33] iteration 367/ 500 | consumed samples: 2936 | elapsed time per iteration (ms): 546381.3 | throughput per GPU (TFLOP/s/GPU): 95.2 | learning rate: 9.543316E-07 | global batch size: 8 | lm loss: 7.765344E-01 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [2024-12-01 00:33:11] iteration 368/ 500 | consumed samples: 2944 | elapsed time per iteration (ms): 458843.2 | throughput per GPU (TFLOP/s/GPU): 113.4 | learning rate: 9.423227E-07 | global batch size: 8 | lm loss: 7.737644E-01 | loss scale: 1.0 | grad norm: 4.278 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 15400 unjoint_samples 15400 joint_samples 46 [379375, 1043822] processed_samples 15400 unjoint_samples 15400 joint_samples 45 [1048022, 1045809] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1047411, 28274] processed_samples 15400 unjoint_samples 15400 joint_samples 47 [1047529, 151070] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [263010, 1047821] processed_samples 15400 unjoint_samples 15400 joint_samples 45 [994220, 1047448] processed_samples 15400 unjoint_samples 15400 joint_samples 47 [467656, 1046659] processed_samples 15400 unjoint_samples 15400 joint_samples 45 [1048022, 1045809] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [379375, 1043822] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1047411, 28274] processed_samples 15400 unjoint_samples 15400 joint_samples 47 [1047529, 151070] processed_samples 15400 unjoint_samples 15400 joint_samples 47 [467656, 1046659] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [263010, 1047821] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1046605, 386686] processed_samples 15400 unjoint_samples 15400 joint_samples 46 [1046605, 386686] processed_samples 15400 unjoint_samples 15400 joint_samples 45 [994220, 1047448] [2024-12-01 00:41:17] iteration 369/ 500 | consumed samples: 2952 | elapsed time per iteration (ms): 485235.8 | throughput per GPU (TFLOP/s/GPU): 107.2 | learning rate: 9.303812E-07 | global batch size: 8 | lm loss: 7.778735E-01 | loss scale: 1.0 | grad norm: 0.625 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [2024-12-01 00:50:04] iteration 370/ 500 | consumed samples: 2960 | elapsed time per iteration (ms): 526954.5 | throughput per GPU (TFLOP/s/GPU): 98.8 | learning rate: 9.185077E-07 | global batch size: 8 | lm loss: 8.354955E-01 | loss scale: 1.0 | grad norm: 0.739 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215eca3f80] mmco: unref short failure [h264 @ 0x56215eca3f80] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1047411, 345187] processed_samples 15500 unjoint_samples 15500 joint_samples 47 [1047529, 476934] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [404461, 1045809] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046514, 227331] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [564845, 1047821] [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046605, 706854] processed_samples 15500 unjoint_samples 15500 joint_samples 47 [784848, 1046659] [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 15500 unjoint_samples 15500 joint_samples 46 [653970, 1043822] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 15500 unjoint_samples 15500 joint_samples 46 [404461, 1045809] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [564845, 1047821] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1047411, 345187] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046514, 227331] processed_samples 15500 unjoint_samples 15500 joint_samples 47 [1047529, 476934] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [1046605, 706854] processed_samples 15500 unjoint_samples 15500 joint_samples 46 [653970, 1043822] processed_samples 15500 unjoint_samples 15500 joint_samples 47 [784848, 1046659] [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x55d51459dd80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51459dd80] mmco: unref short failure [h264 @ 0x55d51459dd80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51459dd80] mmco: unref short failure [h264 @ 0x55d51459dd80] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-12-01 01:02:18] iteration 371/ 500 | consumed samples: 2968 | elapsed time per iteration (ms): 734588.2 | throughput per GPU (TFLOP/s/GPU): 70.8 | learning rate: 9.067026E-07 | global batch size: 8 | lm loss: 8.195457E-01 | loss scale: 1.0 | grad norm: 0.665 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 01:15:02] iteration 372/ 500 | consumed samples: 2976 | elapsed time per iteration (ms): 763327.3 | throughput per GPU (TFLOP/s/GPU): 68.2 | learning rate: 8.949665E-07 | global batch size: 8 | lm loss: 7.728306E-01 | loss scale: 1.0 | grad norm: 0.628 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046514, 489869] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [728191, 1045809] processed_samples 15600 unjoint_samples 15600 joint_samples 47 [15386, 1048015] processed_samples 15600 unjoint_samples 15600 joint_samples 48 [18458, 1046659] processed_samples 15600 unjoint_samples 15600 joint_samples 47 [1047529, 849609] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046514, 489869] processed_samples 15600 unjoint_samples 15600 joint_samples 47 [15386, 1048015] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [728191, 1045809] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1047411, 713963] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046605, 1016691] processed_samples 15600 unjoint_samples 15600 joint_samples 48 [18458, 1046659] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [818786, 1047821] [h264 @ 0x55d513a4c280] mmco: unref short failure processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1047411, 713963] processed_samples 15600 unjoint_samples 15600 joint_samples 46 [818786, 1047821] [h264 @ 0x56215d4974c0] mmco: unref short failure processed_samples 15600 unjoint_samples 15600 joint_samples 46 [1046605, 1016691] processed_samples 15600 unjoint_samples 15600 joint_samples 47 [1047529, 849609] [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [2024-12-01 01:25:36] iteration 373/ 500 | consumed samples: 2984 | elapsed time per iteration (ms): 634333.7 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 8.832998E-07 | global batch size: 8 | lm loss: 8.350987E-01 | loss scale: 1.0 | grad norm: 0.681 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d513a4c280] mmco: unref short failure [h264 @ 0x55d513a4c280] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-12-01 01:37:07] iteration 374/ 500 | consumed samples: 2992 | elapsed time per iteration (ms): 691168.5 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 8.717031E-07 | global batch size: 8 | lm loss: 7.817224E-01 | loss scale: 1.0 | grad norm: 0.547 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [2024-12-01 01:44:03] iteration 375/ 500 | consumed samples: 3000 | elapsed time per iteration (ms): 415366.5 | throughput per GPU (TFLOP/s/GPU): 125.3 | learning rate: 8.601767E-07 | global batch size: 8 | lm loss: 7.525834E-01 | loss scale: 1.0 | grad norm: 0.954 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1046514, 751262] processed_samples 15700 unjoint_samples 15700 joint_samples 48 [1047529, 257143] processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1041522, 131458] processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1047411, 1004159] processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1046605, 275662] processed_samples 15700 unjoint_samples 15700 joint_samples 47 [284312, 1048015] processed_samples 15700 unjoint_samples 15700 joint_samples 48 [247417, 1046659] processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1013183, 1045809] processed_samples 15700 unjoint_samples 15700 joint_samples 48 [1047529, 257143] processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1041522, 131458] processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1046514, 751262] processed_samples 15700 unjoint_samples 15700 joint_samples 47 [284312, 1048015] processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1047411, 1004159] processed_samples 15700 unjoint_samples 15700 joint_samples 47 [1046605, 275662] processed_samples 15700 unjoint_samples 15700 joint_samples 48 [247417, 1046659] processed_samples 15700 unjoint_samples 15700 joint_samples 46 [1013183, 1045809] [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-12-01 01:53:10] iteration 376/ 500 | consumed samples: 3008 | elapsed time per iteration (ms): 547870.2 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 8.487213E-07 | global batch size: 8 | lm loss: 8.182274E-01 | loss scale: 1.0 | grad norm: 0.607 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [2024-12-01 02:02:13] iteration 377/ 500 | consumed samples: 3016 | elapsed time per iteration (ms): 542559.7 | throughput per GPU (TFLOP/s/GPU): 95.9 | learning rate: 8.373373E-07 | global batch size: 8 | lm loss: 8.583971E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d513a4c280] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1042287, 326785] processed_samples 15800 unjoint_samples 15800 joint_samples 48 [1047529, 653291] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [648506, 1048015] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [254576, 1046923] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1041522, 525114] processed_samples 15800 unjoint_samples 15800 joint_samples 46 [1046514, 1026555] processed_samples 15800 unjoint_samples 15800 joint_samples 48 [619260, 1046659] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1042287, 326785] processed_samples 15800 unjoint_samples 15800 joint_samples 48 [1047529, 653291] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [648506, 1048015] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1046605, 680037] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [254576, 1046923] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1041522, 525114] processed_samples 15800 unjoint_samples 15800 joint_samples 48 [619260, 1046659] processed_samples 15800 unjoint_samples 15800 joint_samples 46 [1046514, 1026555] processed_samples 15800 unjoint_samples 15800 joint_samples 47 [1046605, 680037] [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [2024-12-01 02:11:31] iteration 378/ 500 | consumed samples: 3024 | elapsed time per iteration (ms): 557982.8 | throughput per GPU (TFLOP/s/GPU): 93.3 | learning rate: 8.260251E-07 | global batch size: 8 | lm loss: 8.015327E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a4c280] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516effb00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215bc88940] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [2024-12-01 02:25:34] iteration 379/ 500 | consumed samples: 3032 | elapsed time per iteration (ms): 843400.3 | throughput per GPU (TFLOP/s/GPU): 61.7 | learning rate: 8.147852E-07 | global batch size: 8 | lm loss: 7.978020E-01 | loss scale: 1.0 | grad norm: 1.030 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1046605, 83978] processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1046605, 83978] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1041522, 860416] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [244074, 1048371] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [244074, 1048371] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [545936, 1046923] processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1047529, 936904] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [996543, 1048015] [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 15900 unjoint_samples 15900 joint_samples 47 [996543, 1048015] [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1042287, 724518] processed_samples 15900 unjoint_samples 15900 joint_samples 48 [897266, 1046659] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1042287, 724518] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [545936, 1046923] [h264 @ 0x56215b35e3c0] mmco: unref short failure processed_samples 15900 unjoint_samples 15900 joint_samples 48 [897266, 1046659] processed_samples 15900 unjoint_samples 15900 joint_samples 47 [1041522, 860416] processed_samples 15900 unjoint_samples 15900 joint_samples 48 [1047529, 936904] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [2024-12-01 02:33:19] iteration 380/ 500 | consumed samples: 3040 | elapsed time per iteration (ms): 464339.8 | throughput per GPU (TFLOP/s/GPU): 112.1 | learning rate: 8.036182E-07 | global batch size: 8 | lm loss: 8.628017E-01 | loss scale: 1.0 | grad norm: 0.700 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (150869.38, 150869.98) [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [2024-12-01 02:45:31] iteration 381/ 500 | consumed samples: 3048 | elapsed time per iteration (ms): 581575.6 | throughput per GPU (TFLOP/s/GPU): 89.5 | learning rate: 7.925244E-07 | global batch size: 8 | lm loss: 8.163093E-01 | loss scale: 1.0 | grad norm: 1.139 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [2024-12-01 02:55:19] iteration 382/ 500 | consumed samples: 3056 | elapsed time per iteration (ms): 587749.5 | throughput per GPU (TFLOP/s/GPU): 88.5 | learning rate: 7.815044E-07 | global batch size: 8 | lm loss: 7.799031E-01 | loss scale: 1.0 | grad norm: 0.672 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d51863a080] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [2024-12-01 03:05:34] iteration 383/ 500 | consumed samples: 3064 | elapsed time per iteration (ms): 614570.1 | throughput per GPU (TFLOP/s/GPU): 84.7 | learning rate: 7.705586E-07 | global batch size: 8 | lm loss: 7.895235E-01 | loss scale: 1.0 | grad norm: 0.646 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure processed_samples 16000 unjoint_samples 16000 joint_samples 49 [222990, 1046734] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [52873, 1046859] [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure processed_samples 16000 unjoint_samples 16000 joint_samples 48 [1046605, 351559] processed_samples 16000 unjoint_samples 16000 joint_samples 47 [491351, 1048371] processed_samples 16000 unjoint_samples 16000 joint_samples 49 [222990, 1046734] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [52873, 1046859] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [1046605, 351559] processed_samples 16000 unjoint_samples 16000 joint_samples 47 [491351, 1048371] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [8070, 1046923] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [8070, 1046923] [2024-12-01 03:13:47] iteration 384/ 500 | consumed samples: 3072 | elapsed time per iteration (ms): 493563.5 | throughput per GPU (TFLOP/s/GPU): 105.4 | learning rate: 7.596874E-07 | global batch size: 8 | lm loss: 8.024371E-01 | loss scale: 1.0 | grad norm: 0.695 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 16000 unjoint_samples 16000 joint_samples 49 [201258, 1046659] processed_samples 16000 unjoint_samples 16000 joint_samples 49 [201258, 1046659] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [207561, 1048015] processed_samples 16000 unjoint_samples 16000 joint_samples 48 [207561, 1048015] processed_samples 16000 unjoint_samples 16000 joint_samples 47 [1042287, 1031405] processed_samples 16000 unjoint_samples 16000 joint_samples 47 [1042287, 1031405] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d513a1f200] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-12-01 03:22:55] iteration 385/ 500 | consumed samples: 3080 | elapsed time per iteration (ms): 547368.2 | throughput per GPU (TFLOP/s/GPU): 95.1 | learning rate: 7.488913E-07 | global batch size: 8 | lm loss: 8.267174E-01 | loss scale: 1.0 | grad norm: 0.742 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x55d5137029c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 16100 unjoint_samples 16100 joint_samples 48 [395433, 1040248] processed_samples 16100 unjoint_samples 16100 joint_samples 48 [369698, 1046859] [h264 @ 0x56215be9ff00] mmco: unref short failure processed_samples 16100 unjoint_samples 16100 joint_samples 48 [504219, 1048015] [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure processed_samples 16100 unjoint_samples 16100 joint_samples 48 [343406, 1046923] processed_samples 16100 unjoint_samples 16100 joint_samples 48 [1046605, 639047] processed_samples 16100 unjoint_samples 16100 joint_samples 49 [540322, 1046734] processed_samples 16100 unjoint_samples 16100 joint_samples 47 [809565, 1048371] processed_samples 16100 unjoint_samples 16100 joint_samples 49 [486512, 1046659] [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 16100 unjoint_samples 16100 joint_samples 48 [395433, 1040248] processed_samples 16100 unjoint_samples 16100 joint_samples 49 [540322, 1046734] processed_samples 16100 unjoint_samples 16100 joint_samples 48 [369698, 1046859] processed_samples 16100 unjoint_samples 16100 joint_samples 48 [343406, 1046923] processed_samples 16100 unjoint_samples 16100 joint_samples 48 [504219, 1048015] processed_samples 16100 unjoint_samples 16100 joint_samples 49 [486512, 1046659] processed_samples 16100 unjoint_samples 16100 joint_samples 48 [1046605, 639047] processed_samples 16100 unjoint_samples 16100 joint_samples 47 [809565, 1048371] [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [2024-12-01 03:35:28] iteration 386/ 500 | consumed samples: 3088 | elapsed time per iteration (ms): 753179.4 | throughput per GPU (TFLOP/s/GPU): 69.1 | learning rate: 7.381709E-07 | global batch size: 8 | lm loss: 8.138526E-01 | loss scale: 1.0 | grad norm: 0.833 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046838, 36849] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [696758, 1046923] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046838, 36849] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [715740, 1046859] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046605, 893554] processed_samples 16200 unjoint_samples 16200 joint_samples 49 [753503, 1046734] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [696758, 1046923] processed_samples 16200 unjoint_samples 16200 joint_samples 49 [765154, 1046659] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [715740, 1046859] [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 16200 unjoint_samples 16200 joint_samples 49 [753503, 1046734] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [864313, 1048015] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [764185, 1040248] [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure processed_samples 16200 unjoint_samples 16200 joint_samples 48 [1046605, 893554] [h264 @ 0x56215b4e5f40] mmco: unref short failure processed_samples 16200 unjoint_samples 16200 joint_samples 49 [765154, 1046659] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [864313, 1048015] processed_samples 16200 unjoint_samples 16200 joint_samples 48 [764185, 1040248] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [2024-12-01 03:48:38] iteration 387/ 500 | consumed samples: 3096 | elapsed time per iteration (ms): 790397.8 | throughput per GPU (TFLOP/s/GPU): 65.8 | learning rate: 7.275264E-07 | global batch size: 8 | lm loss: 7.640258E-01 | loss scale: 1.0 | grad norm: 0.539 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [2024-12-01 03:59:48] iteration 388/ 500 | consumed samples: 3104 | elapsed time per iteration (ms): 670250.3 | throughput per GPU (TFLOP/s/GPU): 77.6 | learning rate: 7.169584E-07 | global batch size: 8 | lm loss: 7.533755E-01 | loss scale: 1.0 | grad norm: 0.731 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [2024-12-01 04:10:39] iteration 389/ 500 | consumed samples: 3112 | elapsed time per iteration (ms): 650075.4 | throughput per GPU (TFLOP/s/GPU): 80.1 | learning rate: 7.064673E-07 | global batch size: 8 | lm loss: 8.156929E-01 | loss scale: 1.0 | grad norm: 0.596 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [2024-12-01 04:19:38] iteration 390/ 500 | consumed samples: 3120 | elapsed time per iteration (ms): 538977.8 | throughput per GPU (TFLOP/s/GPU): 96.6 | learning rate: 6.960536E-07 | global batch size: 8 | lm loss: 7.825867E-01 | loss scale: 1.0 | grad norm: 0.666 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 16300 unjoint_samples 16300 joint_samples 50 [1038849, 107087] processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1046605, 224095] processed_samples 16300 unjoint_samples 16300 joint_samples 50 [19272, 1047822] processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1009126, 149600] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1046838, 447430] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1021208, 1046859] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [960290, 1046923] processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1046605, 224095] processed_samples 16300 unjoint_samples 16300 joint_samples 50 [1038849, 107087] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [960290, 1046923] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1029841, 1040248] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1046838, 447430] processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1021208, 1046859] processed_samples 16300 unjoint_samples 16300 joint_samples 50 [19272, 1047822] processed_samples 16300 unjoint_samples 16300 joint_samples 49 [1009126, 149600] [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 16300 unjoint_samples 16300 joint_samples 48 [1029841, 1040248] [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [2024-12-01 04:28:39] iteration 391/ 500 | consumed samples: 3128 | elapsed time per iteration (ms): 541262.2 | throughput per GPU (TFLOP/s/GPU): 96.1 | learning rate: 6.857177E-07 | global batch size: 8 | lm loss: 8.480158E-01 | loss scale: 1.0 | grad norm: 0.797 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 04:37:30] iteration 392/ 500 | consumed samples: 3136 | elapsed time per iteration (ms): 531034.6 | throughput per GPU (TFLOP/s/GPU): 98.0 | learning rate: 6.754599E-07 | global batch size: 8 | lm loss: 8.135203E-01 | loss scale: 1.0 | grad norm: 0.631 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 04:46:42] iteration 393/ 500 | consumed samples: 3144 | elapsed time per iteration (ms): 552274.7 | throughput per GPU (TFLOP/s/GPU): 94.2 | learning rate: 6.652809E-07 | global batch size: 8 | lm loss: 8.489851E-01 | loss scale: 1.0 | grad norm: 0.850 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 16400 unjoint_samples 16400 joint_samples 49 [313617, 1046859] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046605, 499525] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046237, 241240] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1009126, 434271] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046562, 221364] processed_samples 16400 unjoint_samples 16400 joint_samples 50 [1038849, 521646] [h264 @ 0x56215b6ebf40] mmco: unref short failure processed_samples 16400 unjoint_samples 16400 joint_samples 50 [343985, 1047822] processed_samples 16400 unjoint_samples 16400 joint_samples 48 [1046838, 743582] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046605, 499525] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [313617, 1046859] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046237, 241240] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1046562, 221364] processed_samples 16400 unjoint_samples 16400 joint_samples 50 [1038849, 521646] [h264 @ 0x55d513904400] mmco: unref short failure processed_samples 16400 unjoint_samples 16400 joint_samples 50 [343985, 1047822] processed_samples 16400 unjoint_samples 16400 joint_samples 48 [1046838, 743582] processed_samples 16400 unjoint_samples 16400 joint_samples 49 [1009126, 434271] [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215d200880] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51719ca00] mmco: unref short failure [h264 @ 0x55d51719ca00] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51719ca00] mmco: unref short failure [h264 @ 0x55d51719ca00] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [2024-12-01 04:58:04] iteration 394/ 500 | consumed samples: 3152 | elapsed time per iteration (ms): 681448.5 | throughput per GPU (TFLOP/s/GPU): 76.4 | learning rate: 6.551809E-07 | global batch size: 8 | lm loss: 8.453261E-01 | loss scale: 1.0 | grad norm: 0.673 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1047034, 28245] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046605, 892360] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1047034, 28245] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046237, 457346] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046237, 457346] processed_samples 16500 unjoint_samples 16500 joint_samples 50 [748263, 1047822] processed_samples 16500 unjoint_samples 16500 joint_samples 50 [1038849, 827441] processed_samples 16500 unjoint_samples 16500 joint_samples 50 [748263, 1047822] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046562, 465836] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046562, 465836] processed_samples 16500 unjoint_samples 16500 joint_samples 50 [1038849, 827441] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1046605, 892360] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1009126, 707858] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [1009126, 707858] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [639624, 1046859] processed_samples 16500 unjoint_samples 16500 joint_samples 49 [639624, 1046859] [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x5621619e5680] mmco: unref short failure [h264 @ 0x5621619e5680] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-12-01 05:10:22] iteration 395/ 500 | consumed samples: 3160 | elapsed time per iteration (ms): 738851.2 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 6.451604E-07 | global batch size: 8 | lm loss: 8.372583E-01 | loss scale: 1.0 | grad norm: 0.717 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [2024-12-01 05:21:54] iteration 396/ 500 | consumed samples: 3168 | elapsed time per iteration (ms): 691548.7 | throughput per GPU (TFLOP/s/GPU): 75.3 | learning rate: 6.352198E-07 | global batch size: 8 | lm loss: 8.565010E-01 | loss scale: 1.0 | grad norm: 0.588 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [2024-12-01 05:32:33] iteration 397/ 500 | consumed samples: 3176 | elapsed time per iteration (ms): 639376.8 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 6.253596E-07 | global batch size: 8 | lm loss: 7.959479E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure processed_samples 16600 unjoint_samples 16600 joint_samples 51 [1043499, 26749] processed_samples 16600 unjoint_samples 16600 joint_samples 51 [970568, 335378] processed_samples 16600 unjoint_samples 16600 joint_samples 50 [146901, 1047612] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1047034, 271818] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 16600 unjoint_samples 16600 joint_samples 49 [871191, 1046859] processed_samples 16600 unjoint_samples 16600 joint_samples 50 [146901, 1047612] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1047034, 271818] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1009126, 997896] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046562, 709072] processed_samples 16600 unjoint_samples 16600 joint_samples 51 [970568, 335378] processed_samples 16600 unjoint_samples 16600 joint_samples 51 [1043499, 26749] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046562, 709072] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046237, 776619] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1046237, 776619] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [1009126, 997896] processed_samples 16600 unjoint_samples 16600 joint_samples 49 [871191, 1046859] [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [2024-12-01 05:41:49] iteration 398/ 500 | consumed samples: 3184 | elapsed time per iteration (ms): 555342.2 | throughput per GPU (TFLOP/s/GPU): 93.7 | learning rate: 6.155801E-07 | global batch size: 8 | lm loss: 8.679034E-01 | loss scale: 1.0 | grad norm: 1.141 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [2024-12-01 05:53:35] iteration 399/ 500 | consumed samples: 3192 | elapsed time per iteration (ms): 706572.2 | throughput per GPU (TFLOP/s/GPU): 73.7 | learning rate: 6.058818E-07 | global batch size: 8 | lm loss: 8.082343E-01 | loss scale: 1.0 | grad norm: 1.042 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure processed_samples 16700 unjoint_samples 16700 joint_samples 51 [1043499, 491958] processed_samples 16700 unjoint_samples 16700 joint_samples 50 [926586, 345655] [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 16700 unjoint_samples 16700 joint_samples 50 [369808, 1046503] processed_samples 16700 unjoint_samples 16700 joint_samples 50 [512500, 1047612] processed_samples 16700 unjoint_samples 16700 joint_samples 51 [970568, 643165] processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046562, 995829] processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1047034, 566436] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046237, 1038259] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 16700 unjoint_samples 16700 joint_samples 50 [926586, 345655] processed_samples 16700 unjoint_samples 16700 joint_samples 50 [512500, 1047612] processed_samples 16700 unjoint_samples 16700 joint_samples 51 [1043499, 491958] processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1047034, 566436] [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 16700 unjoint_samples 16700 joint_samples 50 [369808, 1046503] processed_samples 16700 unjoint_samples 16700 joint_samples 51 [970568, 643165] [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046562, 995829] processed_samples 16700 unjoint_samples 16700 joint_samples 49 [1046237, 1038259] [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [2024-12-01 06:02:17] iteration 400/ 500 | consumed samples: 3200 | elapsed time per iteration (ms): 521421.3 | throughput per GPU (TFLOP/s/GPU): 99.8 | learning rate: 5.962651E-07 | global batch size: 8 | lm loss: 8.783824E-01 | loss scale: 1.0 | grad norm: 0.670 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (156572.19, 156572.53) [2024-12-01 06:13:08] iteration 401/ 500 | consumed samples: 3208 | elapsed time per iteration (ms): 494537.3 | throughput per GPU (TFLOP/s/GPU): 105.2 | learning rate: 5.867304E-07 | global batch size: 8 | lm loss: 7.950339E-01 | loss scale: 1.0 | grad norm: 0.636 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure processed_samples 16800 unjoint_samples 16800 joint_samples 50 [926586, 644707] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [808536, 1047612] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046237, 284440] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046562, 244148] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [705796, 1046503] processed_samples 16800 unjoint_samples 16800 joint_samples 51 [970568, 896207] processed_samples 16800 unjoint_samples 16800 joint_samples 49 [1047034, 859904] processed_samples 16800 unjoint_samples 16800 joint_samples 51 [1043499, 732131] [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 16800 unjoint_samples 16800 joint_samples 50 [926586, 644707] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [808536, 1047612] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [705796, 1046503] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046562, 244148] processed_samples 16800 unjoint_samples 16800 joint_samples 50 [1046237, 284440] processed_samples 16800 unjoint_samples 16800 joint_samples 51 [970568, 896207] processed_samples 16800 unjoint_samples 16800 joint_samples 49 [1047034, 859904] [h264 @ 0x55d517193e00] mmco: unref short failure processed_samples 16800 unjoint_samples 16800 joint_samples 51 [1043499, 732131] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-12-01 06:25:09] iteration 402/ 500 | consumed samples: 3216 | elapsed time per iteration (ms): 720630.2 | throughput per GPU (TFLOP/s/GPU): 72.2 | learning rate: 5.772780E-07 | global batch size: 8 | lm loss: 8.785825E-01 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [2024-12-01 06:38:12] iteration 403/ 500 | consumed samples: 3224 | elapsed time per iteration (ms): 783034.3 | throughput per GPU (TFLOP/s/GPU): 66.5 | learning rate: 5.679084E-07 | global batch size: 8 | lm loss: 7.533233E-01 | loss scale: 1.0 | grad norm: 1.009 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 06:47:27] iteration 404/ 500 | consumed samples: 3232 | elapsed time per iteration (ms): 555287.0 | throughput per GPU (TFLOP/s/GPU): 93.7 | learning rate: 5.586219E-07 | global batch size: 8 | lm loss: 7.910761E-01 | loss scale: 1.0 | grad norm: 0.566 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 06:58:44] iteration 405/ 500 | consumed samples: 3240 | elapsed time per iteration (ms): 677427.3 | throughput per GPU (TFLOP/s/GPU): 76.8 | learning rate: 5.494190E-07 | global batch size: 8 | lm loss: 8.007294E-01 | loss scale: 1.0 | grad norm: 0.787 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure processed_samples 16900 unjoint_samples 16900 joint_samples 52 [95173, 1043931] processed_samples 16900 unjoint_samples 16900 joint_samples 51 [1046364, 46299] processed_samples 16900 unjoint_samples 16900 joint_samples 52 [1046496, 89101] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046237, 563939] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046562, 593415] processed_samples 16900 unjoint_samples 16900 joint_samples 51 [1046364, 46299] [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 16900 unjoint_samples 16900 joint_samples 50 [153945, 1030463] [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 16900 unjoint_samples 16900 joint_samples 50 [987809, 994613] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [153945, 1030463] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [983868, 1046503] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046237, 563939] processed_samples 16900 unjoint_samples 16900 joint_samples 52 [95173, 1043931] processed_samples 16900 unjoint_samples 16900 joint_samples 52 [1046496, 89101] [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure processed_samples 16900 unjoint_samples 16900 joint_samples 50 [1046562, 593415] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [983868, 1046503] processed_samples 16900 unjoint_samples 16900 joint_samples 50 [987809, 994613] [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [2024-12-01 07:10:25] iteration 406/ 500 | consumed samples: 3248 | elapsed time per iteration (ms): 701132.9 | throughput per GPU (TFLOP/s/GPU): 74.2 | learning rate: 5.403001E-07 | global batch size: 8 | lm loss: 8.500883E-01 | loss scale: 1.0 | grad norm: 0.643 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-12-01 07:19:37] iteration 407/ 500 | consumed samples: 3256 | elapsed time per iteration (ms): 551624.2 | throughput per GPU (TFLOP/s/GPU): 94.3 | learning rate: 5.312654E-07 | global batch size: 8 | lm loss: 8.234965E-01 | loss scale: 1.0 | grad norm: 0.589 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d5133c8980] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure processed_samples 17000 unjoint_samples 17000 joint_samples 52 [1046496, 356297] processed_samples 17000 unjoint_samples 17000 joint_samples 50 [451472, 1030463] processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046539, 136773] processed_samples 17000 unjoint_samples 17000 joint_samples 51 [203326, 1046503] processed_samples 17000 unjoint_samples 17000 joint_samples 52 [343107, 1043931] processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046364, 514149] processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046562, 948074] [h264 @ 0x56215cc254c0] mmco: unref short failure processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046237, 898074] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046539, 136773] processed_samples 17000 unjoint_samples 17000 joint_samples 52 [1046496, 356297] processed_samples 17000 unjoint_samples 17000 joint_samples 51 [203326, 1046503] processed_samples 17000 unjoint_samples 17000 joint_samples 52 [343107, 1043931] processed_samples 17000 unjoint_samples 17000 joint_samples 51 [1046364, 514149] processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046237, 898074] processed_samples 17000 unjoint_samples 17000 joint_samples 50 [451472, 1030463] processed_samples 17000 unjoint_samples 17000 joint_samples 50 [1046562, 948074] [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215d072240] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-12-01 07:29:57] iteration 408/ 500 | consumed samples: 3264 | elapsed time per iteration (ms): 619408.4 | throughput per GPU (TFLOP/s/GPU): 84.0 | learning rate: 5.223155E-07 | global batch size: 8 | lm loss: 8.258877E-01 | loss scale: 1.0 | grad norm: 0.616 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [2024-12-01 07:39:06] iteration 409/ 500 | consumed samples: 3272 | elapsed time per iteration (ms): 549850.1 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 5.134507E-07 | global batch size: 8 | lm loss: 8.401117E-01 | loss scale: 1.0 | grad norm: 0.585 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046539, 416250] processed_samples 17100 unjoint_samples 17100 joint_samples 50 [691117, 1030463] processed_samples 17100 unjoint_samples 17100 joint_samples 52 [650495, 1043931] [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 17100 unjoint_samples 17100 joint_samples 51 [182361, 1046533] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [124216, 1043853] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [667616, 1046503] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046364, 772920] [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 17100 unjoint_samples 17100 joint_samples 52 [1046496, 739044] [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046539, 416250] processed_samples 17100 unjoint_samples 17100 joint_samples 52 [650495, 1043931] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [124216, 1043853] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [182361, 1046533] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [667616, 1046503] processed_samples 17100 unjoint_samples 17100 joint_samples 51 [1046364, 772920] processed_samples 17100 unjoint_samples 17100 joint_samples 52 [1046496, 739044] processed_samples 17100 unjoint_samples 17100 joint_samples 50 [691117, 1030463] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215f714500] mmco: unref short failure [h264 @ 0x56215f714500] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [2024-12-01 07:49:41] iteration 410/ 500 | consumed samples: 3280 | elapsed time per iteration (ms): 634908.6 | throughput per GPU (TFLOP/s/GPU): 82.0 | learning rate: 5.046713E-07 | global batch size: 8 | lm loss: 8.124939E-01 | loss scale: 1.0 | grad norm: 0.629 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 08:02:32] iteration 411/ 500 | consumed samples: 3288 | elapsed time per iteration (ms): 770692.2 | throughput per GPU (TFLOP/s/GPU): 67.5 | learning rate: 4.959777E-07 | global batch size: 8 | lm loss: 8.272680E-01 | loss scale: 1.0 | grad norm: 0.704 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure processed_samples 17200 unjoint_samples 17200 joint_samples 53 [1046496, 42043] processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1046364, 81991] processed_samples 17200 unjoint_samples 17200 joint_samples 50 [1022791, 1030463] processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1022425, 1043931] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [512052, 1043853] [h264 @ 0x55d5144ae0c0] mmco: unref short failure processed_samples 17200 unjoint_samples 17200 joint_samples 51 [498091, 1046533] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [945808, 1046503] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [1046539, 847044] [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1046364, 81991] processed_samples 17200 unjoint_samples 17200 joint_samples 53 [1046496, 42043] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [512052, 1043853] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [498091, 1046533] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [1046539, 847044] processed_samples 17200 unjoint_samples 17200 joint_samples 52 [1022425, 1043931] processed_samples 17200 unjoint_samples 17200 joint_samples 51 [945808, 1046503] processed_samples 17200 unjoint_samples 17200 joint_samples 50 [1022791, 1030463] [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-12-01 08:13:28] iteration 412/ 500 | consumed samples: 3296 | elapsed time per iteration (ms): 656266.9 | throughput per GPU (TFLOP/s/GPU): 79.3 | learning rate: 4.873703E-07 | global batch size: 8 | lm loss: 8.331949E-01 | loss scale: 1.0 | grad norm: 0.743 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [2024-12-01 08:21:45] iteration 413/ 500 | consumed samples: 3304 | elapsed time per iteration (ms): 497136.0 | throughput per GPU (TFLOP/s/GPU): 104.7 | learning rate: 4.788494E-07 | global batch size: 8 | lm loss: 7.713867E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [2024-12-01 08:32:02] iteration 414/ 500 | consumed samples: 3312 | elapsed time per iteration (ms): 616735.0 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 4.704155E-07 | global batch size: 8 | lm loss: 7.833863E-01 | loss scale: 1.0 | grad norm: 0.689 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-12-01 08:41:12] iteration 415/ 500 | consumed samples: 3320 | elapsed time per iteration (ms): 550190.2 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 4.620688E-07 | global batch size: 8 | lm loss: 7.809198E-01 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x55d51aa06d80] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 17300 unjoint_samples 17300 joint_samples 52 [254398, 1024488] processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1046496, 325096] processed_samples 17300 unjoint_samples 17300 joint_samples 51 [1046874, 224911] processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046254, 251953] processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046364, 424804] processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1041886, 329927] [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure processed_samples 17300 unjoint_samples 17300 joint_samples 51 [957816, 1046533] processed_samples 17300 unjoint_samples 17300 joint_samples 51 [944115, 1043853] [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure processed_samples 17300 unjoint_samples 17300 joint_samples 52 [254398, 1024488] processed_samples 17300 unjoint_samples 17300 joint_samples 51 [1046874, 224911] processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1046496, 325096] processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046254, 251953] processed_samples 17300 unjoint_samples 17300 joint_samples 53 [1041886, 329927] processed_samples 17300 unjoint_samples 17300 joint_samples 52 [1046364, 424804] processed_samples 17300 unjoint_samples 17300 joint_samples 51 [957816, 1046533] [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 17300 unjoint_samples 17300 joint_samples 51 [944115, 1043853] [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-12-01 08:49:28] iteration 416/ 500 | consumed samples: 3328 | elapsed time per iteration (ms): 495207.7 | throughput per GPU (TFLOP/s/GPU): 105.1 | learning rate: 4.538097E-07 | global batch size: 8 | lm loss: 8.112941E-01 | loss scale: 1.0 | grad norm: 0.704 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [2024-12-01 08:57:17] iteration 417/ 500 | consumed samples: 3336 | elapsed time per iteration (ms): 469064.3 | throughput per GPU (TFLOP/s/GPU): 110.9 | learning rate: 4.456385E-07 | global batch size: 8 | lm loss: 7.586689E-01 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x56215b271600] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure processed_samples 17400 unjoint_samples 17400 joint_samples 51 [1046874, 616398] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046364, 851077] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1047235, 363744] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1045065, 210144] processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1041886, 758627] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046254, 758247] processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1046496, 614136] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [540046, 1024488] [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f714500] mmco: unref short failure processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1047235, 363744] processed_samples 17400 unjoint_samples 17400 joint_samples 51 [1046874, 616398] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [540046, 1024488] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1045065, 210144] processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1041886, 758627] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046364, 851077] processed_samples 17400 unjoint_samples 17400 joint_samples 53 [1046496, 614136] processed_samples 17400 unjoint_samples 17400 joint_samples 52 [1046254, 758247] [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [2024-12-01 09:07:39] iteration 418/ 500 | consumed samples: 3344 | elapsed time per iteration (ms): 622731.4 | throughput per GPU (TFLOP/s/GPU): 83.6 | learning rate: 4.375557E-07 | global batch size: 8 | lm loss: 8.728430E-01 | loss scale: 1.0 | grad norm: 0.695 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215c627e80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5168cfe40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [2024-12-01 09:18:24] iteration 419/ 500 | consumed samples: 3352 | elapsed time per iteration (ms): 644159.3 | throughput per GPU (TFLOP/s/GPU): 80.8 | learning rate: 4.295615E-07 | global batch size: 8 | lm loss: 8.373601E-01 | loss scale: 1.0 | grad norm: 0.924 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [2024-12-01 09:27:56] iteration 420/ 500 | consumed samples: 3360 | elapsed time per iteration (ms): 572431.4 | throughput per GPU (TFLOP/s/GPU): 90.9 | learning rate: 4.216562E-07 | global batch size: 8 | lm loss: 7.823456E-01 | loss scale: 1.0 | grad norm: 0.607 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (259402.89, 259403.23) [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure processed_samples 17500 unjoint_samples 17500 joint_samples 54 [60450, 1019283] processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1047235, 700229] processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046364, 138426] processed_samples 17500 unjoint_samples 17500 joint_samples 53 [19392, 1046811] processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1045065, 565351] processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1047235, 700229] processed_samples 17500 unjoint_samples 17500 joint_samples 54 [60450, 1019283] processed_samples 17500 unjoint_samples 17500 joint_samples 52 [784006, 1024488] processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046364, 138426] processed_samples 17500 unjoint_samples 17500 joint_samples 51 [1046874, 929482] processed_samples 17500 unjoint_samples 17500 joint_samples 52 [1045065, 565351] processed_samples 17500 unjoint_samples 17500 joint_samples 52 [784006, 1024488] processed_samples 17500 unjoint_samples 17500 joint_samples 53 [19392, 1046811] processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046496, 962646] processed_samples 17500 unjoint_samples 17500 joint_samples 53 [1046496, 962646] processed_samples 17500 unjoint_samples 17500 joint_samples 51 [1046874, 929482] [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [2024-12-01 09:41:18] iteration 421/ 500 | consumed samples: 3368 | elapsed time per iteration (ms): 542009.0 | throughput per GPU (TFLOP/s/GPU): 96.0 | learning rate: 4.138403E-07 | global batch size: 8 | lm loss: 8.172854E-01 | loss scale: 1.0 | grad norm: 0.735 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [2024-12-01 09:51:58] iteration 422/ 500 | consumed samples: 3376 | elapsed time per iteration (ms): 640585.4 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 4.061140E-07 | global batch size: 8 | lm loss: 8.059546E-01 | loss scale: 1.0 | grad norm: 0.653 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f714500] mmco: unref short failure [h264 @ 0x55d5168cfe40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1046874, 188377] processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1045065, 800487] processed_samples 17600 unjoint_samples 17600 joint_samples 53 [1046364, 427102] processed_samples 17600 unjoint_samples 17600 joint_samples 54 [1046910, 200641] processed_samples 17600 unjoint_samples 17600 joint_samples 53 [308994, 1046811] [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1002135, 1024488] processed_samples 17600 unjoint_samples 17600 joint_samples 54 [437610, 1019283] processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1046874, 188377] processed_samples 17600 unjoint_samples 17600 joint_samples 54 [1046910, 200641] processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1047235, 983555] processed_samples 17600 unjoint_samples 17600 joint_samples 53 [308994, 1046811] processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1045065, 800487] processed_samples 17600 unjoint_samples 17600 joint_samples 53 [1046364, 427102] [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1047235, 983555] processed_samples 17600 unjoint_samples 17600 joint_samples 54 [437610, 1019283] processed_samples 17600 unjoint_samples 17600 joint_samples 52 [1002135, 1024488] [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [2024-12-01 10:00:49] iteration 423/ 500 | consumed samples: 3384 | elapsed time per iteration (ms): 530376.6 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 3.984776E-07 | global batch size: 8 | lm loss: 8.025057E-01 | loss scale: 1.0 | grad norm: 1.568 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [2024-12-01 10:10:12] iteration 424/ 500 | consumed samples: 3392 | elapsed time per iteration (ms): 563357.7 | throughput per GPU (TFLOP/s/GPU): 92.4 | learning rate: 3.909315E-07 | global batch size: 8 | lm loss: 8.046057E-01 | loss scale: 1.0 | grad norm: 0.828 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-12-01 10:18:09] iteration 425/ 500 | consumed samples: 3400 | elapsed time per iteration (ms): 477347.5 | throughput per GPU (TFLOP/s/GPU): 109.0 | learning rate: 3.834760E-07 | global batch size: 8 | lm loss: 8.117014E-01 | loss scale: 1.0 | grad norm: 0.497 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b70be40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 17700 unjoint_samples 17700 joint_samples 52 [1046874, 504009] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [1046364, 735773] [h264 @ 0x56215b523440] mmco: unref short failure processed_samples 17700 unjoint_samples 17700 joint_samples 53 [196340, 1047550] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [349109, 921903] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [220594, 1037240] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [700155, 1046811] processed_samples 17700 unjoint_samples 17700 joint_samples 54 [1046910, 463930] processed_samples 17700 unjoint_samples 17700 joint_samples 52 [1046874, 504009] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [1046364, 735773] processed_samples 17700 unjoint_samples 17700 joint_samples 54 [1046910, 463930] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [196340, 1047550] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [220594, 1037240] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [349109, 921903] processed_samples 17700 unjoint_samples 17700 joint_samples 54 [747400, 1019283] processed_samples 17700 unjoint_samples 17700 joint_samples 53 [700155, 1046811] processed_samples 17700 unjoint_samples 17700 joint_samples 54 [747400, 1019283] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215b100780] mmco: unref short failure [h264 @ 0x56215b100780] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b100780] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5178a9980] mmco: unref short failure [h264 @ 0x55d5178a9980] mmco: unref short failure [h264 @ 0x55d5178a9980] mmco: unref short failure [h264 @ 0x55d5178a9980] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure processed_samples 17800 unjoint_samples 17800 joint_samples 54 [38127, 1039929] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [478442, 1037240] processed_samples 17800 unjoint_samples 17800 joint_samples 52 [1046874, 851580] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [440619, 1047550] processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1046910, 980262] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [609333, 921903] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [1024371, 1046811] processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1014725, 1019283] [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure processed_samples 17800 unjoint_samples 17800 joint_samples 52 [1046874, 851580] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [440619, 1047550] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [478442, 1037240] processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1014725, 1019283] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [1024371, 1046811] processed_samples 17800 unjoint_samples 17800 joint_samples 54 [1046910, 980262] processed_samples 17800 unjoint_samples 17800 joint_samples 53 [609333, 921903] processed_samples 17800 unjoint_samples 17800 joint_samples 54 [38127, 1039929] [2024-12-01 10:29:57] iteration 426/ 500 | consumed samples: 3408 | elapsed time per iteration (ms): 708209.6 | throughput per GPU (TFLOP/s/GPU): 73.5 | learning rate: 3.761115E-07 | global batch size: 8 | lm loss: 7.919018E-01 | loss scale: 1.0 | grad norm: 0.506 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [2024-12-01 10:40:57] iteration 427/ 500 | consumed samples: 3416 | elapsed time per iteration (ms): 659879.3 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 3.688381E-07 | global batch size: 8 | lm loss: 8.052567E-01 | loss scale: 1.0 | grad norm: 1.120 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51327b000] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [2024-12-01 10:49:45] iteration 428/ 500 | consumed samples: 3424 | elapsed time per iteration (ms): 527278.1 | throughput per GPU (TFLOP/s/GPU): 98.7 | learning rate: 3.616562E-07 | global batch size: 8 | lm loss: 8.389349E-01 | loss scale: 1.0 | grad norm: 0.741 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 10:57:42] iteration 429/ 500 | consumed samples: 3432 | elapsed time per iteration (ms): 476971.2 | throughput per GPU (TFLOP/s/GPU): 109.1 | learning rate: 3.545662E-07 | global batch size: 8 | lm loss: 7.981853E-01 | loss scale: 1.0 | grad norm: 0.759 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [2024-12-01 11:07:54] iteration 430/ 500 | consumed samples: 3440 | elapsed time per iteration (ms): 612005.4 | throughput per GPU (TFLOP/s/GPU): 85.0 | learning rate: 3.475682E-07 | global batch size: 8 | lm loss: 8.206917E-01 | loss scale: 1.0 | grad norm: 0.595 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x55d516c18800] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 17900 unjoint_samples 17900 joint_samples 55 [244587, 1041944] processed_samples 17900 unjoint_samples 17900 joint_samples 53 [117114, 1043785] processed_samples 17900 unjoint_samples 17900 joint_samples 53 [765583, 1047550] processed_samples 17900 unjoint_samples 17900 joint_samples 54 [358824, 1039929] [h264 @ 0x56215f6b5640] mmco: unref short failure processed_samples 17900 unjoint_samples 17900 joint_samples 55 [1036890, 387692] processed_samples 17900 unjoint_samples 17900 joint_samples 54 [220760, 1046811] processed_samples 17900 unjoint_samples 17900 joint_samples 53 [784032, 1037240] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 17900 unjoint_samples 17900 joint_samples 53 [997954, 937444] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 17900 unjoint_samples 17900 joint_samples 55 [244587, 1041944] processed_samples 17900 unjoint_samples 17900 joint_samples 53 [117114, 1043785] [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 17900 unjoint_samples 17900 joint_samples 53 [765583, 1047550] processed_samples 17900 unjoint_samples 17900 joint_samples 54 [358824, 1039929] [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 17900 unjoint_samples 17900 joint_samples 55 [1036890, 387692] processed_samples 17900 unjoint_samples 17900 joint_samples 54 [220760, 1046811] processed_samples 17900 unjoint_samples 17900 joint_samples 53 [784032, 1037240] processed_samples 17900 unjoint_samples 17900 joint_samples 53 [997954, 937444] [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [2024-12-01 11:17:44] iteration 431/ 500 | consumed samples: 3448 | elapsed time per iteration (ms): 590797.5 | throughput per GPU (TFLOP/s/GPU): 88.1 | learning rate: 3.406627E-07 | global batch size: 8 | lm loss: 8.949883E-01 | loss scale: 1.0 | grad norm: 0.689 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [2024-12-01 11:27:48] iteration 432/ 500 | consumed samples: 3456 | elapsed time per iteration (ms): 603113.4 | throughput per GPU (TFLOP/s/GPU): 86.3 | learning rate: 3.338499E-07 | global batch size: 8 | lm loss: 7.817925E-01 | loss scale: 1.0 | grad norm: 0.671 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 18000 unjoint_samples 18000 joint_samples 53 [440767, 1043785] [h264 @ 0x55d514594d00] mmco: unref short failure processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1037696, 157113] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [52930, 1047168] processed_samples 18000 unjoint_samples 18000 joint_samples 55 [590070, 1041944] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [525706, 1046811] processed_samples 18000 unjoint_samples 18000 joint_samples 55 [1036890, 750150] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [672333, 1039929] processed_samples 18000 unjoint_samples 18000 joint_samples 53 [440767, 1043785] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [672333, 1039929] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1037696, 157113] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1047402, 4518] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [52930, 1047168] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [525706, 1046811] processed_samples 18000 unjoint_samples 18000 joint_samples 55 [590070, 1041944] processed_samples 18000 unjoint_samples 18000 joint_samples 55 [1036890, 750150] processed_samples 18000 unjoint_samples 18000 joint_samples 54 [1047402, 4518] [2024-12-01 11:34:52] iteration 433/ 500 | consumed samples: 3464 | elapsed time per iteration (ms): 423893.8 | throughput per GPU (TFLOP/s/GPU): 122.8 | learning rate: 3.271301E-07 | global batch size: 8 | lm loss: 7.689373E-01 | loss scale: 1.0 | grad norm: 0.666 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d5170b6cc0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x55d5168d06c0] mmco: unref short failure [h264 @ 0x56215f714500] mmco: unref short failure [h264 @ 0x56215f714500] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5169efdc0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f714500] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure processed_samples 18100 unjoint_samples 18100 joint_samples 54 [337102, 1047168] processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1047402, 331027] [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1037696, 404133] processed_samples 18100 unjoint_samples 18100 joint_samples 54 [792931, 1046811] [h264 @ 0x55d5149ada40] mmco: unref short failure processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1040475, 1040736] processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1045631, 3389] processed_samples 18100 unjoint_samples 18100 joint_samples 55 [896076, 1041944] processed_samples 18100 unjoint_samples 18100 joint_samples 53 [696959, 1043785] [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 18100 unjoint_samples 18100 joint_samples 53 [696959, 1043785] processed_samples 18100 unjoint_samples 18100 joint_samples 54 [337102, 1047168] processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1047402, 331027] processed_samples 18100 unjoint_samples 18100 joint_samples 54 [1037696, 404133] processed_samples 18100 unjoint_samples 18100 joint_samples 55 [896076, 1041944] [h264 @ 0x56215ccdcdc0] mmco: unref short failure processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1040475, 1040736] processed_samples 18100 unjoint_samples 18100 joint_samples 54 [792931, 1046811] processed_samples 18100 unjoint_samples 18100 joint_samples 55 [1045631, 3389] [2024-12-01 11:45:05] iteration 434/ 500 | consumed samples: 3472 | elapsed time per iteration (ms): 613843.4 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 3.205035E-07 | global batch size: 8 | lm loss: 7.734962E-01 | loss scale: 1.0 | grad norm: 0.539 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [2024-12-01 11:56:07] iteration 435/ 500 | consumed samples: 3480 | elapsed time per iteration (ms): 661397.0 | throughput per GPU (TFLOP/s/GPU): 78.7 | learning rate: 3.139705E-07 | global batch size: 8 | lm loss: 8.416315E-01 | loss scale: 1.0 | grad norm: 0.613 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215f4fa380] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-12-01 12:05:09] iteration 436/ 500 | consumed samples: 3488 | elapsed time per iteration (ms): 542389.6 | throughput per GPU (TFLOP/s/GPU): 95.9 | learning rate: 3.075313E-07 | global batch size: 8 | lm loss: 7.784573E-01 | loss scale: 1.0 | grad norm: 0.670 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [2024-12-01 12:17:23] iteration 437/ 500 | consumed samples: 3496 | elapsed time per iteration (ms): 733435.4 | throughput per GPU (TFLOP/s/GPU): 71.0 | learning rate: 3.011862E-07 | global batch size: 8 | lm loss: 7.929318E-01 | loss scale: 1.0 | grad norm: 0.814 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d512fe9900] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1037696, 787783] processed_samples 18200 unjoint_samples 18200 joint_samples 56 [138085, 1046744] processed_samples 18200 unjoint_samples 18200 joint_samples 56 [261580, 1047141] [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1045631, 237211] processed_samples 18200 unjoint_samples 18200 joint_samples 54 [649075, 1047168] processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1023804, 113039] processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1047402, 646699] processed_samples 18200 unjoint_samples 18200 joint_samples 53 [983067, 1043785] [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure processed_samples 18200 unjoint_samples 18200 joint_samples 56 [261580, 1047141] [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure processed_samples 18200 unjoint_samples 18200 joint_samples 56 [138085, 1046744] processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1023804, 113039] processed_samples 18200 unjoint_samples 18200 joint_samples 55 [1045631, 237211] processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1037696, 787783] processed_samples 18200 unjoint_samples 18200 joint_samples 54 [649075, 1047168] processed_samples 18200 unjoint_samples 18200 joint_samples 54 [1047402, 646699] processed_samples 18200 unjoint_samples 18200 joint_samples 53 [983067, 1043785] [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [2024-12-01 12:27:08] iteration 438/ 500 | consumed samples: 3504 | elapsed time per iteration (ms): 585613.9 | throughput per GPU (TFLOP/s/GPU): 88.9 | learning rate: 2.949354E-07 | global batch size: 8 | lm loss: 8.072410E-01 | loss scale: 1.0 | grad norm: 0.638 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-12-01 12:37:04] iteration 439/ 500 | consumed samples: 3512 | elapsed time per iteration (ms): 596157.9 | throughput per GPU (TFLOP/s/GPU): 87.3 | learning rate: 2.887793E-07 | global batch size: 8 | lm loss: 8.036357E-01 | loss scale: 1.0 | grad norm: 0.636 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure processed_samples 18300 unjoint_samples 18300 joint_samples 54 [1047402, 907297] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 18300 unjoint_samples 18300 joint_samples 55 [103075, 1019416] processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1023804, 370838] processed_samples 18300 unjoint_samples 18300 joint_samples 54 [290617, 1046042] processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1045631, 490566] processed_samples 18300 unjoint_samples 18300 joint_samples 56 [635398, 1047141] [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure processed_samples 18300 unjoint_samples 18300 joint_samples 56 [444978, 1046744] processed_samples 18300 unjoint_samples 18300 joint_samples 54 [1047402, 907297] processed_samples 18300 unjoint_samples 18300 joint_samples 55 [103075, 1019416] [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure processed_samples 18300 unjoint_samples 18300 joint_samples 54 [290617, 1046042] processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1023804, 370838] processed_samples 18300 unjoint_samples 18300 joint_samples 54 [941634, 1047168] processed_samples 18300 unjoint_samples 18300 joint_samples 56 [444978, 1046744] processed_samples 18300 unjoint_samples 18300 joint_samples 55 [1045631, 490566] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 18300 unjoint_samples 18300 joint_samples 56 [635398, 1047141] [h264 @ 0x55d51314e5c0] mmco: unref short failure processed_samples 18300 unjoint_samples 18300 joint_samples 54 [941634, 1047168] [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-12-01 12:47:49] iteration 440/ 500 | consumed samples: 3520 | elapsed time per iteration (ms): 644450.1 | throughput per GPU (TFLOP/s/GPU): 80.8 | learning rate: 2.827180E-07 | global batch size: 8 | lm loss: 8.524545E-01 | loss scale: 1.0 | grad norm: 0.960 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (149544.11, 149544.48) [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [2024-12-01 12:58:26] iteration 441/ 500 | consumed samples: 3528 | elapsed time per iteration (ms): 487681.0 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 2.767519E-07 | global batch size: 8 | lm loss: 8.442378E-01 | loss scale: 1.0 | grad norm: 0.576 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d516c21040] Missing reference picture, default is 65530 [h264 @ 0x55d516c21040] Missing reference picture, default is 65530 [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d733d80] Missing reference picture, default is 65530 [h264 @ 0x56215d733d80] Missing reference picture, default is 65530 [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1047402, 146642] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1047402, 146642] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [362857, 1019416] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1045631, 844468] processed_samples 18400 unjoint_samples 18400 joint_samples 56 [919411, 1047141] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [995693, 286355] processed_samples 18400 unjoint_samples 18400 joint_samples 54 [574148, 1046042] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1023804, 651751] [h264 @ 0x55d51b2c01c0] mmco: unref short failure processed_samples 18400 unjoint_samples 18400 joint_samples 56 [692029, 1046744] processed_samples 18400 unjoint_samples 18400 joint_samples 54 [574148, 1046042] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [362857, 1019416] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [995693, 286355] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1023804, 651751] processed_samples 18400 unjoint_samples 18400 joint_samples 56 [692029, 1046744] processed_samples 18400 unjoint_samples 18400 joint_samples 55 [1045631, 844468] [h264 @ 0x56215b17e600] mmco: unref short failure processed_samples 18400 unjoint_samples 18400 joint_samples 56 [919411, 1047141] [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [2024-12-01 13:09:14] iteration 442/ 500 | consumed samples: 3536 | elapsed time per iteration (ms): 647586.4 | throughput per GPU (TFLOP/s/GPU): 80.4 | learning rate: 2.708811E-07 | global batch size: 8 | lm loss: 8.337551E-01 | loss scale: 1.0 | grad norm: 0.604 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [2024-12-01 13:18:43] iteration 443/ 500 | consumed samples: 3544 | elapsed time per iteration (ms): 568784.5 | throughput per GPU (TFLOP/s/GPU): 91.5 | learning rate: 2.651060E-07 | global batch size: 8 | lm loss: 8.120881E-01 | loss scale: 1.0 | grad norm: 0.553 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5140bdd40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 18500 unjoint_samples 18500 joint_samples 56 [31916, 1044722] processed_samples 18500 unjoint_samples 18500 joint_samples 56 [31916, 1044722] [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1047402, 518299] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1047402, 518299] processed_samples 18500 unjoint_samples 18500 joint_samples 54 [900010, 1046042] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [640036, 1019416] processed_samples 18500 unjoint_samples 18500 joint_samples 57 [134414, 1047141] processed_samples 18500 unjoint_samples 18500 joint_samples 54 [900010, 1046042] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1023804, 1012752] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [640036, 1019416] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [1023804, 1012752] processed_samples 18500 unjoint_samples 18500 joint_samples 57 [134414, 1047141] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [995693, 510802] processed_samples 18500 unjoint_samples 18500 joint_samples 55 [995693, 510802] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure processed_samples 18500 unjoint_samples 18500 joint_samples 56 [943092, 1046744] processed_samples 18500 unjoint_samples 18500 joint_samples 56 [943092, 1046744] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215b453c80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [2024-12-01 13:30:27] iteration 444/ 500 | consumed samples: 3552 | elapsed time per iteration (ms): 704704.3 | throughput per GPU (TFLOP/s/GPU): 73.8 | learning rate: 2.594267E-07 | global batch size: 8 | lm loss: 8.168926E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5140b3b40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [2024-12-01 13:41:06] iteration 445/ 500 | consumed samples: 3560 | elapsed time per iteration (ms): 638414.9 | throughput per GPU (TFLOP/s/GPU): 81.5 | learning rate: 2.538436E-07 | global batch size: 8 | lm loss: 8.196800E-01 | loss scale: 1.0 | grad norm: 0.853 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [2024-12-01 13:50:59] iteration 446/ 500 | consumed samples: 3568 | elapsed time per iteration (ms): 593547.3 | throughput per GPU (TFLOP/s/GPU): 87.7 | learning rate: 2.483568E-07 | global batch size: 8 | lm loss: 8.253284E-01 | loss scale: 1.0 | grad norm: 0.566 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure processed_samples 18600 unjoint_samples 18600 joint_samples 55 [118948, 1046602] processed_samples 18600 unjoint_samples 18600 joint_samples 56 [280311, 1044722] processed_samples 18600 unjoint_samples 18600 joint_samples 57 [1026406, 248733] processed_samples 18600 unjoint_samples 18600 joint_samples 55 [1047402, 815598] processed_samples 18600 unjoint_samples 18600 joint_samples 57 [555359, 1047141] [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 18600 unjoint_samples 18600 joint_samples 55 [980454, 1019416] processed_samples 18600 unjoint_samples 18600 joint_samples 56 [298276, 1041104] processed_samples 18600 unjoint_samples 18600 joint_samples 55 [118948, 1046602] processed_samples 18600 unjoint_samples 18600 joint_samples 55 [995693, 815666] processed_samples 18600 unjoint_samples 18600 joint_samples 56 [280311, 1044722] processed_samples 18600 unjoint_samples 18600 joint_samples 57 [1026406, 248733] processed_samples 18600 unjoint_samples 18600 joint_samples 55 [1047402, 815598] processed_samples 18600 unjoint_samples 18600 joint_samples 57 [555359, 1047141] processed_samples 18600 unjoint_samples 18600 joint_samples 55 [980454, 1019416] processed_samples 18600 unjoint_samples 18600 joint_samples 56 [298276, 1041104] [h264 @ 0x55d514426700] mmco: unref short failure processed_samples 18600 unjoint_samples 18600 joint_samples 55 [995693, 815666] [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [2024-12-01 13:59:36] iteration 447/ 500 | consumed samples: 3576 | elapsed time per iteration (ms): 517153.1 | throughput per GPU (TFLOP/s/GPU): 100.6 | learning rate: 2.429665E-07 | global batch size: 8 | lm loss: 7.605423E-01 | loss scale: 1.0 | grad norm: 0.673 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [2024-12-01 14:09:35] iteration 448/ 500 | consumed samples: 3584 | elapsed time per iteration (ms): 598084.5 | throughput per GPU (TFLOP/s/GPU): 87.0 | learning rate: 2.376731E-07 | global batch size: 8 | lm loss: 7.981852E-01 | loss scale: 1.0 | grad norm: 0.653 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure processed_samples 18700 unjoint_samples 18700 joint_samples 57 [1026406, 551230] [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure processed_samples 18700 unjoint_samples 18700 joint_samples 57 [1026406, 551230] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [253149, 1047183] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [253149, 1047183] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [25383, 1041705] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [25383, 1041705] processed_samples 18700 unjoint_samples 18700 joint_samples 55 [386254, 1046602] processed_samples 18700 unjoint_samples 18700 joint_samples 55 [386254, 1046602] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [735119, 1044722] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [735119, 1044722] processed_samples 18700 unjoint_samples 18700 joint_samples 57 [767829, 1047141] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [788303, 1041104] processed_samples 18700 unjoint_samples 18700 joint_samples 57 [767829, 1047141] processed_samples 18700 unjoint_samples 18700 joint_samples 56 [788303, 1041104] processed_samples 18700 unjoint_samples 18700 joint_samples 55 [1033694, 1034262] [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure processed_samples 18700 unjoint_samples 18700 joint_samples 55 [1033694, 1034262] [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [2024-12-01 14:19:52] iteration 449/ 500 | consumed samples: 3592 | elapsed time per iteration (ms): 617372.2 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 2.324767E-07 | global batch size: 8 | lm loss: 8.294312E-01 | loss scale: 1.0 | grad norm: 0.597 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215f0f3d40] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x56215af7d040] mmco: unref short failure [h264 @ 0x55d51a2c2780] mmco: unref short failure [h264 @ 0x55d51a2c2780] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [2024-12-01 14:27:43] iteration 450/ 500 | consumed samples: 3600 | elapsed time per iteration (ms): 470732.4 | throughput per GPU (TFLOP/s/GPU): 110.6 | learning rate: 2.273775E-07 | global batch size: 8 | lm loss: 7.941829E-01 | loss scale: 1.0 | grad norm: 0.700 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51a2c2780] mmco: unref short failure [2024-12-01 14:40:02] iteration 451/ 500 | consumed samples: 3608 | elapsed time per iteration (ms): 739733.5 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 2.223758E-07 | global batch size: 8 | lm loss: 8.482308E-01 | loss scale: 1.0 | grad norm: 0.898 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1046793, 55820] processed_samples 18800 unjoint_samples 18800 joint_samples 55 [792957, 1046602] processed_samples 18800 unjoint_samples 18800 joint_samples 55 [792957, 1046602] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1046793, 55820] processed_samples 18800 unjoint_samples 18800 joint_samples 56 [260160, 1041705] processed_samples 18800 unjoint_samples 18800 joint_samples 56 [1038546, 315206] processed_samples 18800 unjoint_samples 18800 joint_samples 56 [260160, 1041705] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1026406, 971244] processed_samples 18800 unjoint_samples 18800 joint_samples 56 [1038546, 315206] processed_samples 18800 unjoint_samples 18800 joint_samples 56 [472463, 1047183] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1047210, 144209] processed_samples 18800 unjoint_samples 18800 joint_samples 56 [472463, 1047183] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1047210, 144209] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1026406, 971244] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1031035, 1047141] processed_samples 18800 unjoint_samples 18800 joint_samples 57 [1031035, 1047141] [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x56215b879700] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215ba5b0c0] mmco: unref short failure [2024-12-01 14:48:07] iteration 452/ 500 | consumed samples: 3616 | elapsed time per iteration (ms): 484888.6 | throughput per GPU (TFLOP/s/GPU): 107.3 | learning rate: 2.174717E-07 | global batch size: 8 | lm loss: 7.795358E-01 | loss scale: 1.0 | grad norm: 0.744 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [2024-12-01 14:55:46] iteration 453/ 500 | consumed samples: 3624 | elapsed time per iteration (ms): 458864.6 | throughput per GPU (TFLOP/s/GPU): 113.4 | learning rate: 2.126655E-07 | global batch size: 8 | lm loss: 7.805303E-01 | loss scale: 1.0 | grad norm: 0.842 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x562163d38640] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x56215de0a2c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [2024-12-01 15:05:07] iteration 454/ 500 | consumed samples: 3632 | elapsed time per iteration (ms): 560659.6 | throughput per GPU (TFLOP/s/GPU): 92.8 | learning rate: 2.079574E-07 | global batch size: 8 | lm loss: 8.343635E-01 | loss scale: 1.0 | grad norm: 0.889 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure processed_samples 18900 unjoint_samples 18900 joint_samples 56 [569433, 1041705] processed_samples 18900 unjoint_samples 18900 joint_samples 56 [95527, 1046602] processed_samples 18900 unjoint_samples 18900 joint_samples 58 [329619, 1029045] processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1046793, 412118] processed_samples 18900 unjoint_samples 18900 joint_samples 56 [1038546, 728147] processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1047210, 474342] processed_samples 18900 unjoint_samples 18900 joint_samples 58 [317017, 1047141] processed_samples 18900 unjoint_samples 18900 joint_samples 56 [805862, 1047183] [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure processed_samples 18900 unjoint_samples 18900 joint_samples 56 [569433, 1041705] processed_samples 18900 unjoint_samples 18900 joint_samples 56 [95527, 1046602] processed_samples 18900 unjoint_samples 18900 joint_samples 58 [329619, 1029045] processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1046793, 412118] processed_samples 18900 unjoint_samples 18900 joint_samples 56 [1038546, 728147] processed_samples 18900 unjoint_samples 18900 joint_samples 57 [1047210, 474342] processed_samples 18900 unjoint_samples 18900 joint_samples 58 [317017, 1047141] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 18900 unjoint_samples 18900 joint_samples 56 [805862, 1047183] [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d4ee5f85c0] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d5145cc380] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-12-01 15:15:40] iteration 455/ 500 | consumed samples: 3640 | elapsed time per iteration (ms): 632799.3 | throughput per GPU (TFLOP/s/GPU): 82.2 | learning rate: 2.033476E-07 | global batch size: 8 | lm loss: 8.013310E-01 | loss scale: 1.0 | grad norm: 0.688 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215b9b7740] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure processed_samples 19000 unjoint_samples 19000 joint_samples 57 [65066, 1047183] processed_samples 19000 unjoint_samples 19000 joint_samples 56 [304472, 1046602] processed_samples 19000 unjoint_samples 19000 joint_samples 58 [698928, 1029045] processed_samples 19000 unjoint_samples 19000 joint_samples 56 [886793, 1041705] processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1046793, 730143] [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1047210, 906829] processed_samples 19000 unjoint_samples 19000 joint_samples 58 [603869, 1047141] [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure processed_samples 19000 unjoint_samples 19000 joint_samples 56 [304472, 1046602] processed_samples 19000 unjoint_samples 19000 joint_samples 56 [1043659, 1043971] [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure processed_samples 19000 unjoint_samples 19000 joint_samples 57 [65066, 1047183] processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1046793, 730143] processed_samples 19000 unjoint_samples 19000 joint_samples 58 [698928, 1029045] processed_samples 19000 unjoint_samples 19000 joint_samples 56 [886793, 1041705] processed_samples 19000 unjoint_samples 19000 joint_samples 58 [603869, 1047141] processed_samples 19000 unjoint_samples 19000 joint_samples 56 [1043659, 1043971] [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 19000 unjoint_samples 19000 joint_samples 57 [1047210, 906829] [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [2024-12-01 15:23:19] iteration 456/ 500 | consumed samples: 3648 | elapsed time per iteration (ms): 459434.2 | throughput per GPU (TFLOP/s/GPU): 113.3 | learning rate: 1.988362E-07 | global batch size: 8 | lm loss: 8.068063E-01 | loss scale: 1.0 | grad norm: 0.831 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [2024-12-01 15:32:27] iteration 457/ 500 | consumed samples: 3656 | elapsed time per iteration (ms): 547788.4 | throughput per GPU (TFLOP/s/GPU): 95.0 | learning rate: 1.944234E-07 | global batch size: 8 | lm loss: 8.406883E-01 | loss scale: 1.0 | grad norm: 0.614 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f63a6c0] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215f63a6c0] mmco: unref short failure [h264 @ 0x56215f63a6c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d513099d80] mmco: unref short failure [h264 @ 0x55d513099d80] mmco: unref short failure [2024-12-01 15:41:17] iteration 458/ 500 | consumed samples: 3664 | elapsed time per iteration (ms): 530106.6 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 1.901095E-07 | global batch size: 8 | lm loss: 8.074347E-01 | loss scale: 1.0 | grad norm: 0.774 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1048091, 24957] [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1046793, 1039709] processed_samples 19100 unjoint_samples 19100 joint_samples 56 [675892, 1046602] processed_samples 19100 unjoint_samples 19100 joint_samples 58 [920268, 1047141] [h264 @ 0x55d5143806c0] mmco: unref short failure processed_samples 19100 unjoint_samples 19100 joint_samples 58 [1047210, 309387] processed_samples 19100 unjoint_samples 19100 joint_samples 57 [409035, 1046382] processed_samples 19100 unjoint_samples 19100 joint_samples 57 [364518, 1047183] [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure processed_samples 19100 unjoint_samples 19100 joint_samples 58 [920268, 1047141] processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1048091, 24957] [h264 @ 0x56215c3e4200] mmco: unref short failure processed_samples 19100 unjoint_samples 19100 joint_samples 58 [1047210, 309387] processed_samples 19100 unjoint_samples 19100 joint_samples 57 [409035, 1046382] processed_samples 19100 unjoint_samples 19100 joint_samples 58 [983695, 1029045] processed_samples 19100 unjoint_samples 19100 joint_samples 57 [364518, 1047183] processed_samples 19100 unjoint_samples 19100 joint_samples 57 [1046793, 1039709] processed_samples 19100 unjoint_samples 19100 joint_samples 58 [983695, 1029045] processed_samples 19100 unjoint_samples 19100 joint_samples 56 [675892, 1046602] [h264 @ 0x55d5171f5180] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [2024-12-01 15:53:37] iteration 459/ 500 | consumed samples: 3672 | elapsed time per iteration (ms): 739678.3 | throughput per GPU (TFLOP/s/GPU): 70.4 | learning rate: 1.858946E-07 | global batch size: 8 | lm loss: 8.207039E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [2024-12-01 16:03:51] iteration 460/ 500 | consumed samples: 3680 | elapsed time per iteration (ms): 614020.7 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 1.817789E-07 | global batch size: 8 | lm loss: 7.755750E-01 | loss scale: 1.0 | grad norm: 0.541 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (156851.09, 156851.38) [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d513099d80] mmco: unref short failure [h264 @ 0x55d513099d80] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5145449c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [2024-12-01 16:15:19] iteration 461/ 500 | consumed samples: 3688 | elapsed time per iteration (ms): 531317.9 | throughput per GPU (TFLOP/s/GPU): 97.9 | learning rate: 1.777626E-07 | global batch size: 8 | lm loss: 8.081508E-01 | loss scale: 1.0 | grad norm: 0.616 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 57 [737652, 1046382] [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 59 [1007487, 313847] processed_samples 19200 unjoint_samples 19200 joint_samples 59 [167467, 1047141] processed_samples 19200 unjoint_samples 19200 joint_samples 57 [1048091, 334724] [h264 @ 0x56215b217200] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 58 [261142, 1041956] processed_samples 19200 unjoint_samples 19200 joint_samples 57 [619786, 1047183] processed_samples 19200 unjoint_samples 19200 joint_samples 58 [1047210, 573972] [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 56 [908267, 1046602] [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d512746f00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 57 [737652, 1046382] processed_samples 19200 unjoint_samples 19200 joint_samples 58 [1047210, 573972] [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 59 [1007487, 313847] processed_samples 19200 unjoint_samples 19200 joint_samples 59 [167467, 1047141] processed_samples 19200 unjoint_samples 19200 joint_samples 57 [1048091, 334724] processed_samples 19200 unjoint_samples 19200 joint_samples 58 [261142, 1041956] processed_samples 19200 unjoint_samples 19200 joint_samples 57 [619786, 1047183] [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure processed_samples 19200 unjoint_samples 19200 joint_samples 56 [908267, 1046602] [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [2024-12-01 16:26:07] iteration 462/ 500 | consumed samples: 3696 | elapsed time per iteration (ms): 648360.6 | throughput per GPU (TFLOP/s/GPU): 80.3 | learning rate: 1.738458E-07 | global batch size: 8 | lm loss: 8.348542E-01 | loss scale: 1.0 | grad norm: 0.687 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d513034b80] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure processed_samples 19300 unjoint_samples 19300 joint_samples 58 [485454, 1041956] processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1000455, 204511] processed_samples 19300 unjoint_samples 19300 joint_samples 59 [1007487, 683177] processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1048091, 729811] processed_samples 19300 unjoint_samples 19300 joint_samples 57 [897921, 1047183] [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1046284, 8585] [h264 @ 0x56215b8cc100] mmco: unref short failure [h264 @ 0x56215b8cc100] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure [h264 @ 0x56215cef2780] mmco: unref short failure processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1000455, 204511] processed_samples 19300 unjoint_samples 19300 joint_samples 58 [485454, 1041956] processed_samples 19300 unjoint_samples 19300 joint_samples 57 [1048091, 729811] processed_samples 19300 unjoint_samples 19300 joint_samples 59 [1007487, 683177] processed_samples 19300 unjoint_samples 19300 joint_samples 57 [897921, 1047183] [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1046284, 8585] [2024-12-01 16:36:01] iteration 463/ 500 | consumed samples: 3704 | elapsed time per iteration (ms): 593789.3 | throughput per GPU (TFLOP/s/GPU): 87.6 | learning rate: 1.700287E-07 | global batch size: 8 | lm loss: 8.368133E-01 | loss scale: 1.0 | grad norm: 0.838 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1047210, 1009908] processed_samples 19300 unjoint_samples 19300 joint_samples 58 [1047210, 1009908] processed_samples 19300 unjoint_samples 19300 joint_samples 59 [363026, 1047141] processed_samples 19300 unjoint_samples 19300 joint_samples 59 [363026, 1047141] [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [2024-12-01 16:44:38] iteration 464/ 500 | consumed samples: 3712 | elapsed time per iteration (ms): 516905.7 | throughput per GPU (TFLOP/s/GPU): 100.7 | learning rate: 1.663114E-07 | global batch size: 8 | lm loss: 8.324036E-01 | loss scale: 1.0 | grad norm: 0.696 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [2024-12-01 16:52:33] iteration 465/ 500 | consumed samples: 3720 | elapsed time per iteration (ms): 475094.9 | throughput per GPU (TFLOP/s/GPU): 109.5 | learning rate: 1.626942E-07 | global batch size: 8 | lm loss: 7.755831E-01 | loss scale: 1.0 | grad norm: 0.612 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 17:02:51] iteration 466/ 500 | consumed samples: 3728 | elapsed time per iteration (ms): 617440.9 | throughput per GPU (TFLOP/s/GPU): 84.3 | learning rate: 1.591772E-07 | global batch size: 8 | lm loss: 7.885929E-01 | loss scale: 1.0 | grad norm: 0.528 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [2024-12-01 17:14:28] iteration 467/ 500 | consumed samples: 3736 | elapsed time per iteration (ms): 697117.0 | throughput per GPU (TFLOP/s/GPU): 74.7 | learning rate: 1.557604E-07 | global batch size: 8 | lm loss: 8.216323E-01 | loss scale: 1.0 | grad norm: 0.602 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 19400 unjoint_samples 19400 joint_samples 58 [816460, 1041956] processed_samples 19400 unjoint_samples 19400 joint_samples 58 [816460, 1041956] [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d5135c10c0] mmco: unref short failure [h264 @ 0x55d5135c10c0] mmco: unref short failure processed_samples 19400 unjoint_samples 19400 joint_samples 59 [1047252, 252965] processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1048091, 71684] processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1048091, 71684] processed_samples 19400 unjoint_samples 19400 joint_samples 59 [1047252, 252965] processed_samples 19400 unjoint_samples 19400 joint_samples 58 [130945, 1047183] processed_samples 19400 unjoint_samples 19400 joint_samples 58 [130945, 1047183] processed_samples 19400 unjoint_samples 19400 joint_samples 60 [200308, 967315] processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1046284, 348046] processed_samples 19400 unjoint_samples 19400 joint_samples 60 [200308, 967315] processed_samples 19400 unjoint_samples 19400 joint_samples 57 [1000455, 463197] processed_samples 19400 unjoint_samples 19400 joint_samples 57 [1000455, 463197] [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure processed_samples 19400 unjoint_samples 19400 joint_samples 58 [1046284, 348046] processed_samples 19400 unjoint_samples 19400 joint_samples 59 [762947, 1047141] processed_samples 19400 unjoint_samples 19400 joint_samples 59 [762947, 1047141] [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215b89c080] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5135c10c0] mmco: unref short failure [h264 @ 0x55d5135c10c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b10e440] mmco: unref short failure [h264 @ 0x56215b0a5600] mmco: unref short failure [h264 @ 0x55d51bd54680] mmco: unref short failure [h264 @ 0x55d51bd54680] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [2024-12-01 17:23:18] iteration 468/ 500 | consumed samples: 3744 | elapsed time per iteration (ms): 530298.5 | throughput per GPU (TFLOP/s/GPU): 98.1 | learning rate: 1.524441E-07 | global batch size: 8 | lm loss: 8.004850E-01 | loss scale: 1.0 | grad norm: 0.500 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d516b59080] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [2024-12-01 17:33:32] iteration 469/ 500 | consumed samples: 3752 | elapsed time per iteration (ms): 613465.0 | throughput per GPU (TFLOP/s/GPU): 84.8 | learning rate: 1.492284E-07 | global batch size: 8 | lm loss: 8.411949E-01 | loss scale: 1.0 | grad norm: 0.694 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure processed_samples 19500 unjoint_samples 19500 joint_samples 59 [98421, 1044129] processed_samples 19500 unjoint_samples 19500 joint_samples 60 [520072, 967315] processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1048091, 498445] processed_samples 19500 unjoint_samples 19500 joint_samples 59 [1047252, 580686] processed_samples 19500 unjoint_samples 19500 joint_samples 59 [98421, 1044129] processed_samples 19500 unjoint_samples 19500 joint_samples 58 [428724, 1047183] processed_samples 19500 unjoint_samples 19500 joint_samples 58 [428724, 1047183] processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1046284, 654391] processed_samples 19500 unjoint_samples 19500 joint_samples 57 [1000455, 750201] processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1048091, 498445] processed_samples 19500 unjoint_samples 19500 joint_samples 60 [520072, 967315] processed_samples 19500 unjoint_samples 19500 joint_samples 59 [1047252, 580686] processed_samples 19500 unjoint_samples 19500 joint_samples 58 [1046284, 654391] processed_samples 19500 unjoint_samples 19500 joint_samples 60 [1046315, 65258] processed_samples 19500 unjoint_samples 19500 joint_samples 60 [1046315, 65258] [h264 @ 0x56215d350080] mmco: unref short failure processed_samples 19500 unjoint_samples 19500 joint_samples 57 [1000455, 750201] [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x56215af78cc0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51bddb580] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [2024-12-01 17:43:55] iteration 470/ 500 | consumed samples: 3760 | elapsed time per iteration (ms): 623450.1 | throughput per GPU (TFLOP/s/GPU): 83.5 | learning rate: 1.461135E-07 | global batch size: 8 | lm loss: 7.550452E-01 | loss scale: 1.0 | grad norm: 0.538 | number of skipped iterations: 0 | number of nan iterations: 0 | processed_samples 19600 unjoint_samples 19600 joint_samples 59 [375679, 1044129] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [31722, 1040879] processed_samples 19600 unjoint_samples 19600 joint_samples 59 [1047252, 855252] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [663528, 1047183] processed_samples 19600 unjoint_samples 19600 joint_samples 60 [816049, 967315] processed_samples 19600 unjoint_samples 19600 joint_samples 60 [1046315, 403777] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1048091, 839233] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1046284, 989007] processed_samples 19600 unjoint_samples 19600 joint_samples 59 [375679, 1044129] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [31722, 1040879] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1048091, 839233] processed_samples 19600 unjoint_samples 19600 joint_samples 60 [816049, 967315] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [663528, 1047183] processed_samples 19600 unjoint_samples 19600 joint_samples 60 [1046315, 403777] processed_samples 19600 unjoint_samples 19600 joint_samples 59 [1047252, 855252] processed_samples 19600 unjoint_samples 19600 joint_samples 58 [1046284, 989007] [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d51714d1c0] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [2024-12-01 17:53:55] iteration 471/ 500 | consumed samples: 3768 | elapsed time per iteration (ms): 600048.5 | throughput per GPU (TFLOP/s/GPU): 86.7 | learning rate: 1.430994E-07 | global batch size: 8 | lm loss: 8.090125E-01 | loss scale: 1.0 | grad norm: 0.647 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [2024-12-01 18:03:05] iteration 472/ 500 | consumed samples: 3776 | elapsed time per iteration (ms): 550162.4 | throughput per GPU (TFLOP/s/GPU): 94.6 | learning rate: 1.401863E-07 | global batch size: 8 | lm loss: 8.100052E-01 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d516b67a00] mmco: unref short failure [h264 @ 0x56215d8bbcc0] mmco: unref short failure [2024-12-01 18:13:30] iteration 473/ 500 | consumed samples: 3784 | elapsed time per iteration (ms): 625071.6 | throughput per GPU (TFLOP/s/GPU): 83.3 | learning rate: 1.373743E-07 | global batch size: 8 | lm loss: 8.084416E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x55d514051f80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [2024-12-01 18:21:31] iteration 474/ 500 | consumed samples: 3792 | elapsed time per iteration (ms): 480291.6 | throughput per GPU (TFLOP/s/GPU): 108.4 | learning rate: 1.346635E-07 | global batch size: 8 | lm loss: 8.181583E-01 | loss scale: 1.0 | grad norm: 0.827 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure processed_samples 19700 unjoint_samples 19700 joint_samples 59 [84539, 1047531] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [946056, 1044129] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [954011, 205390] processed_samples 19700 unjoint_samples 19700 joint_samples 60 [112581, 1045897] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [184479, 1038729] processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1046315, 844762] processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1015354, 1016036] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [84539, 1047531] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [946056, 1044129] processed_samples 19700 unjoint_samples 19700 joint_samples 60 [112581, 1045897] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [954011, 205390] processed_samples 19700 unjoint_samples 19700 joint_samples 59 [184479, 1038729] processed_samples 19700 unjoint_samples 19700 joint_samples 58 [562460, 1040879] processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1046315, 844762] processed_samples 19700 unjoint_samples 19700 joint_samples 58 [562460, 1040879] processed_samples 19700 unjoint_samples 19700 joint_samples 60 [1015354, 1016036] [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215f8b9c80] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [2024-12-01 18:33:01] iteration 475/ 500 | consumed samples: 3800 | elapsed time per iteration (ms): 690389.1 | throughput per GPU (TFLOP/s/GPU): 75.4 | learning rate: 1.320541E-07 | global batch size: 8 | lm loss: 8.206989E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d4f080e440] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [2024-12-01 18:44:01] iteration 476/ 500 | consumed samples: 3808 | elapsed time per iteration (ms): 659896.9 | throughput per GPU (TFLOP/s/GPU): 78.9 | learning rate: 1.295461E-07 | global batch size: 8 | lm loss: 8.581502E-01 | loss scale: 1.0 | grad norm: 0.634 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 18:50:58] iteration 477/ 500 | consumed samples: 3816 | elapsed time per iteration (ms): 417078.0 | throughput per GPU (TFLOP/s/GPU): 124.8 | learning rate: 1.271397E-07 | global batch size: 8 | lm loss: 7.727896E-01 | loss scale: 1.0 | grad norm: 0.577 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x56215b25ac00] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 19800 unjoint_samples 19800 joint_samples 61 [265482, 1044262] [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure processed_samples 19800 unjoint_samples 19800 joint_samples 60 [1046343, 311230] processed_samples 19800 unjoint_samples 19800 joint_samples 59 [542442, 1047531] processed_samples 19800 unjoint_samples 19800 joint_samples 61 [232279, 1034490] [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure processed_samples 19800 unjoint_samples 19800 joint_samples 58 [1026052, 1040879] [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure processed_samples 19800 unjoint_samples 19800 joint_samples 59 [462838, 1038729] processed_samples 19800 unjoint_samples 19800 joint_samples 60 [473328, 1045897] processed_samples 19800 unjoint_samples 19800 joint_samples 61 [265482, 1044262] processed_samples 19800 unjoint_samples 19800 joint_samples 61 [232279, 1034490] processed_samples 19800 unjoint_samples 19800 joint_samples 59 [542442, 1047531] processed_samples 19800 unjoint_samples 19800 joint_samples 60 [1046343, 311230] processed_samples 19800 unjoint_samples 19800 joint_samples 59 [954011, 559959] processed_samples 19800 unjoint_samples 19800 joint_samples 58 [1026052, 1040879] processed_samples 19800 unjoint_samples 19800 joint_samples 59 [462838, 1038729] processed_samples 19800 unjoint_samples 19800 joint_samples 60 [473328, 1045897] processed_samples 19800 unjoint_samples 19800 joint_samples 59 [954011, 559959] [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [2024-12-01 19:01:37] iteration 478/ 500 | consumed samples: 3824 | elapsed time per iteration (ms): 639452.6 | throughput per GPU (TFLOP/s/GPU): 81.4 | learning rate: 1.248349E-07 | global batch size: 8 | lm loss: 7.901743E-01 | loss scale: 1.0 | grad norm: 0.554 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d5173c1180] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x562163b30a80] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure processed_samples 19900 unjoint_samples 19900 joint_samples 59 [1046416, 204673] [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure processed_samples 19900 unjoint_samples 19900 joint_samples 59 [977170, 980548] processed_samples 19900 unjoint_samples 19900 joint_samples 60 [1046343, 707013] processed_samples 19900 unjoint_samples 19900 joint_samples 59 [851276, 1047531] processed_samples 19900 unjoint_samples 19900 joint_samples 61 [525522, 1034490] [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure processed_samples 19900 unjoint_samples 19900 joint_samples 60 [795319, 1045897] processed_samples 19900 unjoint_samples 19900 joint_samples 61 [601682, 1044262] processed_samples 19900 unjoint_samples 19900 joint_samples 59 [746959, 1038729] [h264 @ 0x55d51314e5c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215b35e3c0] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure processed_samples 19900 unjoint_samples 19900 joint_samples 61 [525522, 1034490] processed_samples 19900 unjoint_samples 19900 joint_samples 59 [1046416, 204673] processed_samples 19900 unjoint_samples 19900 joint_samples 61 [601682, 1044262] processed_samples 19900 unjoint_samples 19900 joint_samples 60 [1046343, 707013] [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure processed_samples 19900 unjoint_samples 19900 joint_samples 59 [851276, 1047531] processed_samples 19900 unjoint_samples 19900 joint_samples 59 [746959, 1038729] processed_samples 19900 unjoint_samples 19900 joint_samples 59 [977170, 980548] processed_samples 19900 unjoint_samples 19900 joint_samples 60 [795319, 1045897] [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x55d514594d00] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215e2d08c0] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [2024-12-01 19:14:07] iteration 479/ 500 | consumed samples: 3832 | elapsed time per iteration (ms): 749352.0 | throughput per GPU (TFLOP/s/GPU): 69.4 | learning rate: 1.226319E-07 | global batch size: 8 | lm loss: 7.682730E-01 | loss scale: 1.0 | grad norm: 0.636 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [2024-12-01 19:21:56] iteration 480/ 500 | consumed samples: 3840 | elapsed time per iteration (ms): 468975.4 | throughput per GPU (TFLOP/s/GPU): 111.0 | learning rate: 1.205308E-07 | global batch size: 8 | lm loss: 7.834165E-01 | loss scale: 1.0 | grad norm: 0.903 | number of skipped iterations: 0 | number of nan iterations: 0 | (min, max) time across ranks (ms): save-checkpoint ................................: (269678.47, 269678.92) [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [2024-12-01 19:36:42] iteration 481/ 500 | consumed samples: 3848 | elapsed time per iteration (ms): 616920.2 | throughput per GPU (TFLOP/s/GPU): 84.4 | learning rate: 1.185315E-07 | global batch size: 8 | lm loss: 7.638129E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1046343, 1010438] processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1044665, 100033] processed_samples 20000 unjoint_samples 20000 joint_samples 60 [312813, 998374] processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1046416, 608727] processed_samples 20000 unjoint_samples 20000 joint_samples 61 [914952, 1044262] processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1046343, 1010438] processed_samples 20000 unjoint_samples 20000 joint_samples 60 [1044665, 100033] processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1046416, 608727] processed_samples 20000 unjoint_samples 20000 joint_samples 60 [312813, 998374] processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1012305, 1038729] [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 20000 unjoint_samples 20000 joint_samples 61 [914952, 1044262] processed_samples 20000 unjoint_samples 20000 joint_samples 61 [1047462, 61045] processed_samples 20000 unjoint_samples 20000 joint_samples 59 [1012305, 1038729] [h264 @ 0x56215c492cc0] mmco: unref short failure processed_samples 20000 unjoint_samples 20000 joint_samples 61 [1047462, 61045] processed_samples 20000 unjoint_samples 20000 joint_samples 61 [812346, 1034490] [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure processed_samples 20000 unjoint_samples 20000 joint_samples 61 [812346, 1034490] [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-12-01 19:43:55] iteration 482/ 500 | consumed samples: 3856 | elapsed time per iteration (ms): 432689.8 | throughput per GPU (TFLOP/s/GPU): 120.3 | learning rate: 1.166343E-07 | global batch size: 8 | lm loss: 7.994441E-01 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 19:57:16] iteration 483/ 500 | consumed samples: 3864 | elapsed time per iteration (ms): 800993.8 | throughput per GPU (TFLOP/s/GPU): 65.0 | learning rate: 1.148392E-07 | global batch size: 8 | lm loss: 8.334441E-01 | loss scale: 1.0 | grad norm: 0.609 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5149ada40] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d516aee0c0] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [2024-12-01 20:06:06] iteration 484/ 500 | consumed samples: 3872 | elapsed time per iteration (ms): 529985.7 | throughput per GPU (TFLOP/s/GPU): 98.2 | learning rate: 1.131463E-07 | global batch size: 8 | lm loss: 8.393610E-01 | loss scale: 1.0 | grad norm: 0.637 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x55d516adca40] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x56215d350080] mmco: unref short failure [h264 @ 0x55d51300b900] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x55d516aa3180] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215babf100] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d51478a440] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x56215c077380] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x55d5171918c0] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-12-01 20:14:04] iteration 485/ 500 | consumed samples: 3880 | elapsed time per iteration (ms): 477879.8 | throughput per GPU (TFLOP/s/GPU): 108.9 | learning rate: 1.115556E-07 | global batch size: 8 | lm loss: 8.007555E-01 | loss scale: 1.0 | grad norm: 0.675 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x5621606a3fc0] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure processed_samples 20100 unjoint_samples 20100 joint_samples 61 [1047462, 266841] processed_samples 20100 unjoint_samples 20100 joint_samples 61 [400497, 1033549] processed_samples 20100 unjoint_samples 20100 joint_samples 60 [668134, 998374] processed_samples 20100 unjoint_samples 20100 joint_samples 62 [191447, 1044311] processed_samples 20100 unjoint_samples 20100 joint_samples 62 [65666, 1040241] processed_samples 20100 unjoint_samples 20100 joint_samples 60 [275657, 1044774] processed_samples 20100 unjoint_samples 20100 joint_samples 60 [1044665, 487196] processed_samples 20100 unjoint_samples 20100 joint_samples 59 [1046416, 1024246] [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure processed_samples 20100 unjoint_samples 20100 joint_samples 61 [1047462, 266841] processed_samples 20100 unjoint_samples 20100 joint_samples 61 [400497, 1033549] processed_samples 20100 unjoint_samples 20100 joint_samples 60 [668134, 998374] processed_samples 20100 unjoint_samples 20100 joint_samples 62 [65666, 1040241] processed_samples 20100 unjoint_samples 20100 joint_samples 62 [191447, 1044311] processed_samples 20100 unjoint_samples 20100 joint_samples 60 [275657, 1044774] processed_samples 20100 unjoint_samples 20100 joint_samples 60 [1044665, 487196] processed_samples 20100 unjoint_samples 20100 joint_samples 59 [1046416, 1024246] [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x55d51407fb00] mmco: unref short failure [2024-12-01 20:22:12] iteration 486/ 500 | consumed samples: 3888 | elapsed time per iteration (ms): 487957.7 | throughput per GPU (TFLOP/s/GPU): 106.7 | learning rate: 1.100672E-07 | global batch size: 8 | lm loss: 8.267097E-01 | loss scale: 1.0 | grad norm: 0.748 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x55d508176540] mmco: unref short failure [h264 @ 0x56215c5f1200] mmco: unref short failure [h264 @ 0x55d5135b3b40] mmco: unref short failure [h264 @ 0x55d5135b3b40] mmco: unref short failure [h264 @ 0x55d517cb1000] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215b523440] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x56215c3b4780] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d5135b3b40] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d5135b3b40] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x56215f5b0d40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x55d5139fcf40] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure processed_samples 20200 unjoint_samples 20200 joint_samples 62 [531865, 1044311] processed_samples 20200 unjoint_samples 20200 joint_samples 60 [344919, 1038046] [h264 @ 0x56215b1efec0] mmco: unref short failure processed_samples 20200 unjoint_samples 20200 joint_samples 60 [1044665, 952866] processed_samples 20200 unjoint_samples 20200 joint_samples 60 [584681, 1044774] processed_samples 20200 unjoint_samples 20200 joint_samples 61 [1047462, 697513] processed_samples 20200 unjoint_samples 20200 joint_samples 61 [645939, 1033549] processed_samples 20200 unjoint_samples 20200 joint_samples 62 [413173, 1040241] [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure processed_samples 20200 unjoint_samples 20200 joint_samples 60 [945907, 998374] [h264 @ 0x55d5145a1340] mmco: unref short failure processed_samples 20200 unjoint_samples 20200 joint_samples 62 [531865, 1044311] processed_samples 20200 unjoint_samples 20200 joint_samples 60 [344919, 1038046] processed_samples 20200 unjoint_samples 20200 joint_samples 62 [413173, 1040241] processed_samples 20200 unjoint_samples 20200 joint_samples 60 [1044665, 952866] processed_samples 20200 unjoint_samples 20200 joint_samples 61 [645939, 1033549] processed_samples 20200 unjoint_samples 20200 joint_samples 60 [584681, 1044774] processed_samples 20200 unjoint_samples 20200 joint_samples 60 [945907, 998374] processed_samples 20200 unjoint_samples 20200 joint_samples 61 [1047462, 697513] [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215b0a7480] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215be9ff00] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [2024-12-01 20:33:26] iteration 487/ 500 | consumed samples: 3896 | elapsed time per iteration (ms): 674174.1 | throughput per GPU (TFLOP/s/GPU): 77.2 | learning rate: 1.086813E-07 | global batch size: 8 | lm loss: 7.753429E-01 | loss scale: 1.0 | grad norm: 0.588 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [2024-12-01 20:41:55] iteration 488/ 500 | consumed samples: 3904 | elapsed time per iteration (ms): 508561.6 | throughput per GPU (TFLOP/s/GPU): 102.3 | learning rate: 1.073977E-07 | global batch size: 8 | lm loss: 8.101053E-01 | loss scale: 1.0 | grad norm: 0.628 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x56215b977f80] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215b1efec0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d512dcb440] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x56215cf234c0] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d51b873240] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure processed_samples 20300 unjoint_samples 20300 joint_samples 61 [192732, 1046867] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [171092, 1046713] processed_samples 20300 unjoint_samples 20300 joint_samples 62 [836409, 1044311] processed_samples 20300 unjoint_samples 20300 joint_samples 60 [867649, 1044774] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [1047462, 963152] processed_samples 20300 unjoint_samples 20300 joint_samples 62 [716439, 1040241] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [945851, 1033549] processed_samples 20300 unjoint_samples 20300 joint_samples 60 [618860, 1038046] [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure processed_samples 20300 unjoint_samples 20300 joint_samples 60 [618860, 1038046] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [192732, 1046867] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [171092, 1046713] processed_samples 20300 unjoint_samples 20300 joint_samples 62 [836409, 1044311] processed_samples 20300 unjoint_samples 20300 joint_samples 62 [716439, 1040241] processed_samples 20300 unjoint_samples 20300 joint_samples 60 [867649, 1044774] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [1047462, 963152] processed_samples 20300 unjoint_samples 20300 joint_samples 61 [945851, 1033549] [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [h264 @ 0x56215ba1fac0] mmco: unref short failure [2024-12-01 20:51:39] iteration 489/ 500 | consumed samples: 3912 | elapsed time per iteration (ms): 584105.2 | throughput per GPU (TFLOP/s/GPU): 89.1 | learning rate: 1.062166E-07 | global batch size: 8 | lm loss: 8.325140E-01 | loss scale: 1.0 | grad norm: 0.651 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x55d519613300] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [h264 @ 0x56215b47d3c0] mmco: unref short failure [2024-12-01 20:59:55] iteration 490/ 500 | consumed samples: 3920 | elapsed time per iteration (ms): 496115.7 | throughput per GPU (TFLOP/s/GPU): 104.9 | learning rate: 1.051381E-07 | global batch size: 8 | lm loss: 7.974008E-01 | loss scale: 1.0 | grad norm: 0.660 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x55d517306a80] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x56215bcba400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x56215b6ebf40] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [h264 @ 0x56215b706800] mmco: unref short failure [2024-12-01 21:10:36] iteration 491/ 500 | consumed samples: 3928 | elapsed time per iteration (ms): 640917.9 | throughput per GPU (TFLOP/s/GPU): 81.2 | learning rate: 1.041621E-07 | global batch size: 8 | lm loss: 8.110026E-01 | loss scale: 1.0 | grad norm: 0.596 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215b7c4500] mmco: unref short failure [h264 @ 0x55d51340ff80] mmco: unref short failure [2024-12-01 21:21:02] iteration 492/ 500 | consumed samples: 3936 | elapsed time per iteration (ms): 625653.9 | throughput per GPU (TFLOP/s/GPU): 83.2 | learning rate: 1.032888E-07 | global batch size: 8 | lm loss: 8.021253E-01 | loss scale: 1.0 | grad norm: 0.913 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x56215ec3c040] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x55d5143806c0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215ccdcdc0] mmco: unref short failure [h264 @ 0x56215e474480] mmco: unref short failure processed_samples 20400 unjoint_samples 20400 joint_samples 61 [603384, 1046713] processed_samples 20400 unjoint_samples 20400 joint_samples 63 [73355, 1045348] [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1046106, 124157] [h264 @ 0x56215f6b5640] mmco: unref short failure [h264 @ 0x56215f6b5640] mmco: unref short failure processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1047462, 335378] processed_samples 20400 unjoint_samples 20400 joint_samples 61 [1045911, 122833] [h264 @ 0x55d51a586540] mmco: unref short failure [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 20400 unjoint_samples 20400 joint_samples 61 [609605, 1046867] processed_samples 20400 unjoint_samples 20400 joint_samples 60 [989778, 1038046] processed_samples 20400 unjoint_samples 20400 joint_samples 62 [974322, 1040241] [h264 @ 0x55d5171918c0] mmco: unref short failure processed_samples 20400 unjoint_samples 20400 joint_samples 61 [603384, 1046713] processed_samples 20400 unjoint_samples 20400 joint_samples 63 [73355, 1045348] [h264 @ 0x55d516c21040] mmco: unref short failure [h264 @ 0x55d516c21040] mmco: unref short failure processed_samples 20400 unjoint_samples 20400 joint_samples 61 [1045911, 122833] processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1046106, 124157] processed_samples 20400 unjoint_samples 20400 joint_samples 62 [1047462, 335378] processed_samples 20400 unjoint_samples 20400 joint_samples 61 [609605, 1046867] processed_samples 20400 unjoint_samples 20400 joint_samples 60 [989778, 1038046] processed_samples 20400 unjoint_samples 20400 joint_samples 62 [974322, 1040241] [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x56215ee100c0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-12-01 21:28:07] iteration 493/ 500 | consumed samples: 3944 | elapsed time per iteration (ms): 425120.9 | throughput per GPU (TFLOP/s/GPU): 122.4 | learning rate: 1.025181E-07 | global batch size: 8 | lm loss: 8.234768E-01 | loss scale: 1.0 | grad norm: 0.632 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514afd580] mmco: unref short failure [h264 @ 0x55d514426700] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [2024-12-01 21:38:03] iteration 494/ 500 | consumed samples: 3952 | elapsed time per iteration (ms): 596715.8 | throughput per GPU (TFLOP/s/GPU): 87.2 | learning rate: 1.018501E-07 | global batch size: 8 | lm loss: 7.998064E-01 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x56215bcafa40] mmco: unref short failure [h264 @ 0x55d517193e00] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x55d517cb9180] mmco: unref short failure [h264 @ 0x5621618e98c0] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215b928840] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d51401df80] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215cbc4280] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x56215f845e80] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d51421e400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1040126, 271248] processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1046106, 426680] processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1045911, 338188] processed_samples 20500 unjoint_samples 20500 joint_samples 63 [1047251, 198669] processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1021682, 1046867] processed_samples 20500 unjoint_samples 20500 joint_samples 63 [362330, 1045348] processed_samples 20500 unjoint_samples 20500 joint_samples 61 [913941, 1046713] processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1047462, 561982] [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x56215bb8bc00] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure [h264 @ 0x56215b217200] mmco: unref short failure processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1045911, 338188] processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1046106, 426680] processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1040126, 271248] processed_samples 20500 unjoint_samples 20500 joint_samples 63 [1047251, 198669] [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 20500 unjoint_samples 20500 joint_samples 61 [1021682, 1046867] processed_samples 20500 unjoint_samples 20500 joint_samples 63 [362330, 1045348] [h264 @ 0x55d51bcf3900] mmco: unref short failure [h264 @ 0x55d51bcf3900] mmco: unref short failure processed_samples 20500 unjoint_samples 20500 joint_samples 61 [913941, 1046713] processed_samples 20500 unjoint_samples 20500 joint_samples 62 [1047462, 561982] [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215f571a00] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d514409540] mmco: unref short failure [h264 @ 0x5621608a8680] mmco: unref short failure [2024-12-01 21:47:09] iteration 495/ 500 | consumed samples: 3960 | elapsed time per iteration (ms): 545102.0 | throughput per GPU (TFLOP/s/GPU): 95.5 | learning rate: 1.012849E-07 | global batch size: 8 | lm loss: 8.540187E-01 | loss scale: 1.0 | grad norm: 0.621 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x55d512fcdfc0] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bfb3a00] mmco: unref short failure [h264 @ 0x56215bb86100] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x56215c492cc0] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x55d51722b380] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x56215f060d00] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d5194f31c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x55d51b2c01c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215b17e600] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51376ab40] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x56215bb50880] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d516fe4400] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [2024-12-01 21:54:43] iteration 496/ 500 | consumed samples: 3968 | elapsed time per iteration (ms): 454849.5 | throughput per GPU (TFLOP/s/GPU): 114.4 | learning rate: 1.008223E-07 | global batch size: 8 | lm loss: 8.191250E-01 | loss scale: 1.0 | grad norm: 0.725 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d516ae3f00] mmco: unref short failure [h264 @ 0x55d5183f9f40] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x55d5145a1340] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x56215d8ddd40] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51326c900] mmco: unref short failure [h264 @ 0x55d51713dd00] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215bf9f540] mmco: unref short failure [h264 @ 0x56215baca280] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d51735fc80] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d517608ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d514b7c6c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215f0f40c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215d4974c0] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1040126, 588687] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [312567, 1046867] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [177108, 1046713] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1047462, 923886] processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1045911, 666328] processed_samples 20600 unjoint_samples 20600 joint_samples 63 [1047251, 636474] processed_samples 20600 unjoint_samples 20600 joint_samples 63 [641892, 1045348] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1046106, 784127] [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x55d5130aa740] mmco: unref short failure [h264 @ 0x56215d4b4580] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure [h264 @ 0x55d518378ac0] mmco: unref short failure processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1040126, 588687] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [312567, 1046867] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [177108, 1046713] processed_samples 20600 unjoint_samples 20600 joint_samples 63 [1047251, 636474] processed_samples 20600 unjoint_samples 20600 joint_samples 61 [1045911, 666328] processed_samples 20600 unjoint_samples 20600 joint_samples 63 [641892, 1045348] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1047462, 923886] processed_samples 20600 unjoint_samples 20600 joint_samples 62 [1046106, 784127] [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x562161968b80] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d51368aac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x55d516907ac0] mmco: unref short failure [h264 @ 0x56215d83de00] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x562161d176c0] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x55d515131d40] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x56215e692c80] mmco: unref short failure [h264 @ 0x56215b75cc40] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [2024-12-01 22:05:19] iteration 497/ 500 | consumed samples: 3976 | elapsed time per iteration (ms): 635852.3 | throughput per GPU (TFLOP/s/GPU): 81.8 | learning rate: 1.004626E-07 | global batch size: 8 | lm loss: 8.748317E-01 | loss scale: 1.0 | grad norm: 0.712 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x55d5144ae0c0] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x56215c18da40] mmco: unref short failure [h264 @ 0x55d516e74e00] mmco: unref short failure [h264 @ 0x562163486b40] mmco: unref short failure [h264 @ 0x55d5139fff40] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x55d513765580] mmco: unref short failure [h264 @ 0x562160004ec0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x56215c8ed2c0] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x55d5141fd280] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215d733d80] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x56215c354fc0] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d516b7ca40] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x55d513779c00] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215cf77480] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [h264 @ 0x56215c19aa80] mmco: unref short failure [2024-12-01 22:15:15] iteration 498/ 500 | consumed samples: 3984 | elapsed time per iteration (ms): 595413.6 | throughput per GPU (TFLOP/s/GPU): 87.4 | learning rate: 1.002056E-07 | global batch size: 8 | lm loss: 8.245853E-01 | loss scale: 1.0 | grad norm: 0.575 | number of skipped iterations: 0 | number of nan iterations: 0 | [h264 @ 0x562160944a00] mmco: unref short failure [h264 @ 0x55d513a64780] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x55d5148de040] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x56215be6bd40] mmco: unref short failure [h264 @ 0x55d51a1f0840] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d5141f5f40] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x55d51723b300] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure [h264 @ 0x56215b3f09c0] mmco: unref short failure processed_samples 20700 unjoint_samples 20700 joint_samples 62 [652277, 1046867] [h264 @ 0x56215c37f280] mmco: unref short failure [h264 @ 0x56215c37f280] mmco: unref short failure processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1046106, 76576] processed_samples 20700 unjoint_samples 20700 joint_samples 62 [531223, 1046713] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [890598, 1045348] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [254512, 1032742] processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1040126, 867371] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1047251, 935193] processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1045911, 992219] [h264 @ 0x55d51a586540] mmco: unref short failure processed_samples 20700 unjoint_samples 20700 joint_samples 62 [652277, 1046867] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1046106, 76576] processed_samples 20700 unjoint_samples 20700 joint_samples 62 [531223, 1046713] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [890598, 1045348] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [254512, 1032742] processed_samples 20700 unjoint_samples 20700 joint_samples 63 [1047251, 935193] processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1040126, 867371] processed_samples 20700 unjoint_samples 20700 joint_samples 61 [1045911, 992219] [h264 @ 0x56215b2188c0] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d514261240] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x55d513904400] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215b4e5f40] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215c7c6240] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [h264 @ 0x56215cc254c0] mmco: unref short failure [2024-12-01 22:23:50] iteration 499/ 500 | consumed samples: 3992 | elapsed time per iteration (ms): 514810.0 | throughput per GPU (TFLOP/s/GPU): 101.1 | learning rate: 1.000514E-07 | global batch size: 8 | lm loss: 8.215189E-01 | loss scale: 1.0 | grad norm: 1.011 | number of skipped iterations: 0 | number of nan iterations: 0 | [2024-12-01 22:33:14] iteration 500/ 500 | consumed samples: 4000 | elapsed time per iteration (ms): 564651.1 | throughput per GPU (TFLOP/s/GPU): 92.2 | learning rate: 1.000000E-07 | global batch size: 8 | lm loss: 8.424357E-01 | loss scale: 1.0 | grad norm: 0.598 | number of skipped iterations: 0 | number of nan iterations: 0 | [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2'[Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2'[Errno 19] No such device: 'data_2'[Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' [Errno 19] No such device: 'data_2' (min, max) time across ranks (ms): save-checkpoint ................................: (259269.35, 259269.62) [2024-12-01 22:44:31,628] torch.distributed.elastic.agent.server.api: [ERROR] Error waiting on exit barrier. Elapsed: 300.10674571990967 seconds + set +x