diff --git "a/experiments/output.log" "b/experiments/output.log" deleted file mode 100644--- "a/experiments/output.log" +++ /dev/null @@ -1,2244 +0,0 @@ -nohup: ignoring input -[2023-02-21 19:51:35,707] [WARNING] [runner.py:186:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. -[2023-02-21 19:51:35,765] [INFO] [runner.py:548:main] cmd = /opt/conda/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None tune_gpt.py --deepspeed deepspeed.json --upload-experiment -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -[2023-02-21 19:51:38,485] [INFO] [launch.py:135:main] 0 NCCL_VERSION=2.11.4 -[2023-02-21 19:51:38,485] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} -[2023-02-21 19:51:38,485] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=8, node_rank=0 -[2023-02-21 19:51:38,485] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) -[2023-02-21 19:51:38,485] [INFO] [launch.py:162:main] dist_world_size=8 -[2023-02-21 19:51:38,485] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. 
- from pandas import MultiIndex, Int64Index -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -No config specified, defaulting to: apps/all -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -No config specified, defaulting to: apps/all -Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5) -Max length: 2048 -PyTorch: setting up devices -[2023-02-21 19:51:59,246] [INFO] [comm.py:657:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -Max length: 2048 -PyTorch: setting up devices -Max length: 2048 -PyTorch: setting up devices -Max length: 2048 -PyTorch: setting up devices -Max length: 2048Max length: 2048 - -PyTorch: setting up devices -PyTorch: setting up devices -Max length: 2048 -PyTorch: setting up devices -Max length: 2048 -PyTorch: setting up devices -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). 
-The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-). -GPU memory occupied: 7801 MB. -GPU memory occupied: 7801 MB. -[2023-02-21 19:51:59,917] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed info: version=0.8.1, git-hash=unknown, git-branch=unknown -GPU memory occupied: 7801 MB. -GPU memory occupied: 7801 MB. -GPU memory occupied: 7801 MB. -GPU memory occupied: 7801 MB. -GPU memory occupied: 7801 MB. -GPU memory occupied: 7801 MB. -[2023-02-21 19:52:02,386] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Detected CUDA files, patching ldflags -Emitting ninja build file /home/.cache/torch_extensions/py38_cu117/cpu_adam/build.ninja... -Building extension module cpu_adam... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -ninja: no work to do. -Loading extension module cpu_adam... -Time to load cpu_adam op: 3.051539182662964 seconds -Loading extension module cpu_adam... 
-Time to load cpu_adam op: 2.8587288856506348 seconds -Loading extension module cpu_adam... -Time to load cpu_adam op: 2.8896634578704834 seconds -Loading extension module cpu_adam... -Time to load cpu_adam op: 2.8358540534973145 seconds -Loading extension module cpu_adam... -Time to load cpu_adam op: 2.7415549755096436 seconds -Loading extension module cpu_adam... -Time to load cpu_adam op: 2.8722808361053467 seconds -Loading extension module cpu_adam... -Time to load cpu_adam op: 2.8915891647338867 seconds -Loading extension module cpu_adam... -Time to load cpu_adam op: 3.0029048919677734 seconds -Adam Optimizer #0 is created with AVX512 arithmetic capability. -Config: alpha=0.000100, betas=(0.900000, 0.999000), weight_decay=0.100000, adam_w=1 -[2023-02-21 19:52:09,340] [INFO] [logging.py:75:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer -[2023-02-21 19:52:09,347] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam -[2023-02-21 19:52:09,347] [INFO] [utils.py:53:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type= -[2023-02-21 19:52:09,347] [INFO] [logging.py:75:log_dist] [Rank 0] Creating torch.float32 ZeRO stage 2 optimizer -[2023-02-21 19:52:09,348] [INFO] [stage_1_and_2.py:144:__init__] Reduce bucket size 500000000 -[2023-02-21 19:52:09,348] [INFO] [stage_1_and_2.py:145:__init__] Allgather bucket size 500000000 -[2023-02-21 19:52:09,348] [INFO] [stage_1_and_2.py:146:__init__] CPU Offload: True -[2023-02-21 19:52:09,348] [INFO] [stage_1_and_2.py:147:__init__] Round robin gradient partitioning: False -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Emitting ninja build file /home/.cache/torch_extensions/py38_cu117/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... -Time to load utils op: 0.5143241882324219 seconds -Loading extension module utils... -Time to load utils op: 0.4041626453399658 seconds -Loading extension module utils... -Time to load utils op: 0.40406346321105957 seconds -Loading extension module utils... -Time to load utils op: 0.40425753593444824 seconds -Loading extension module utils... -Time to load utils op: 0.40483736991882324 seconds -Loading extension module utils... -Loading extension module utils... -Time to load utils op: 0.4042036533355713 seconds -Time to load utils op: 0.4030454158782959 seconds -Loading extension module utils... 
-Time to load utils op: 0.4040055274963379 seconds -Rank: 5 partition count [8] and sizes[(15649824, False)] -Rank: 0 partition count [8] and sizes[(15649824, False)] -Rank: 2 partition count [8] and sizes[(15649824, False)] -Rank: 3 partition count [8] and sizes[(15649824, False)] -Rank: 4 partition count [8] and sizes[(15649824, False)] -Rank: 6 partition count [8] and sizes[(15649824, False)] -Rank: 1 partition count [8] and sizes[(15649824, False)] -Rank: 7 partition count [8] and sizes[(15649824, False)] -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0007538795471191406 seconds -[2023-02-21 19:52:14,395] [INFO] [utils.py:825:see_memory_usage] Before initializing optimizer states -[2023-02-21 19:52:14,405] [INFO] [utils.py:826:see_memory_usage] MA 0.66 GB Max_MA 0.66 GB CA 0.85 GB Max_CA 1 GB -[2023-02-21 19:52:14,405] [INFO] [utils.py:834:see_memory_usage] CPU Virtual Memory: used = 42.5 GB, percent = 16.9% -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.005951404571533203 seconds -You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0007529258728027344 seconds -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.020640850067138672 seconds -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0007100105285644531 seconds -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0004687309265136719 seconds -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0029997825622558594 seconds -You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. -You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. -You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. 
-You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. -You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. -You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. -[2023-02-21 19:52:14,635] [INFO] [utils.py:825:see_memory_usage] After initializing optimizer states -[2023-02-21 19:52:14,635] [INFO] [utils.py:826:see_memory_usage] MA 0.66 GB Max_MA 0.66 GB CA 0.85 GB Max_CA 1 GB -[2023-02-21 19:52:14,636] [INFO] [utils.py:834:see_memory_usage] CPU Virtual Memory: used = 42.46 GB, percent = 16.9% -[2023-02-21 19:52:14,636] [INFO] [stage_1_and_2.py:527:__init__] optimizer state initialized -[2023-02-21 19:52:14,719] [INFO] [utils.py:825:see_memory_usage] After initializing ZeRO optimizer -[2023-02-21 19:52:14,720] [INFO] [utils.py:826:see_memory_usage] MA 0.66 GB Max_MA 0.66 GB CA 0.85 GB Max_CA 1 GB -[2023-02-21 19:52:14,720] [INFO] [utils.py:834:see_memory_usage] CPU Virtual Memory: used = 42.46 GB, percent = 16.9% -[2023-02-21 19:52:14,721] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw -[2023-02-21 19:52:14,721] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = WarmupLR -[2023-02-21 19:52:14,721] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed LR Scheduler = -[2023-02-21 19:52:14,721] [INFO] [logging.py:75:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0001], mom=[[0.9, 0.999]] -[2023-02-21 19:52:14,722] [INFO] [config.py:1009:print] DeepSpeedEngine configuration: -[2023-02-21 19:52:14,722] [INFO] [config.py:1013:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2023-02-21 19:52:14,722] [INFO] [config.py:1013:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} -[2023-02-21 19:52:14,722] [INFO] [config.py:1013:print] amp_enabled .................. False -[2023-02-21 19:52:14,722] [INFO] [config.py:1013:print] amp_params ................... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] autotuning_config ............ { - "enabled": false, - "start_step": null, - "end_step": null, - "metric_path": null, - "arg_mappings": null, - "metric": "throughput", - "model_info": null, - "results_dir": "autotuning_results", - "exps_dir": "autotuning_exps", - "overwrite": true, - "fast": true, - "start_profile_step": 3, - "end_profile_step": 5, - "tuner_type": "gridsearch", - "tuner_early_stopping": 5, - "tuner_num_trials": 50, - "model_info_path": null, - "mp_size": 1, - "max_train_batch_size": null, - "min_train_batch_size": 1, - "max_train_micro_batch_size_per_gpu": 1.024000e+03, - "min_train_micro_batch_size_per_gpu": 1, - "num_tuning_micro_batch_sizes": 3 -} -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] bfloat16_enabled ............. 
False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] checkpoint_parallel_write_pipeline False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] checkpoint_tag_validation_enabled True -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] checkpoint_tag_validation_fail False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] comms_config ................. -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] communication_data_type ...... None -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] curriculum_enabled_legacy .... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] curriculum_params_legacy ..... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] data_efficiency_enabled ...... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] dataloader_drop_last ......... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] disable_allgather ............ False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] dump_state ................... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] dynamic_loss_scale_args ...... None -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_enabled ........... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_gas_boundary_resolution 1 -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_layer_name ........ bert.encoder.layer -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_layer_num ......... 0 -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_max_iter .......... 100 -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_stability ......... 1e-06 -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_tol ............... 0.01 -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] eigenvalue_verbose ........... False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] elasticity_enabled ........... 
False -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] fp16_auto_cast ............... None -[2023-02-21 19:52:14,723] [INFO] [config.py:1013:print] fp16_enabled ................. False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] fp16_master_weights_and_gradients False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] global_rank .................. 0 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] grad_accum_dtype ............. None -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] gradient_accumulation_steps .. 4 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] gradient_clipping ............ 1.0 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] gradient_predivide_factor .... 1.0 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] initial_dynamic_scale ........ 65536 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] load_universal_checkpoint .... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] loss_scale ................... 0 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] memory_breakdown ............. False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=True, output_path='logs/', job_name='train_neo') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=True -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] nebula_config ................ { - "enabled": false, - "persistent_storage_path": null, - "persistent_time_interval": 100, - "num_of_version_in_retention": 2, - "enable_nebula_load": true, - "load_path": null -} -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] optimizer_legacy_fusion ...... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] optimizer_name ............... adamw -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] optimizer_params ............. {'lr': 0.0001, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.1} -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] pld_enabled .................. False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] pld_params ................... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] prescale_gradients ........... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] scheduler_name ............... WarmupLR -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] scheduler_params ............. {'warmup_min_lr': 1e-09, 'warmup_max_lr': 0.0001, 'warmup_num_steps': 1000} -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] sparse_attention ............. None -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] sparse_gradients_enabled ..... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] steps_per_print .............. 2000 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] train_batch_size ............. 
192 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] train_micro_batch_size_per_gpu 6 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] use_node_local_storage ....... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] wall_clock_breakdown ......... False -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] world_size ................... 8 -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] zero_allow_untested_optimizer True -[2023-02-21 19:52:14,724] [INFO] [config.py:1013:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False -[2023-02-21 19:52:14,725] [INFO] [config.py:1013:print] zero_enabled ................. True -[2023-02-21 19:52:14,725] [INFO] [config.py:1013:print] zero_optimization_stage ...... 2 -[2023-02-21 19:52:14,725] [INFO] [config.py:998:print_user_config] json = { - "optimizer": { - "type": "AdamW", - "params": { - "lr": 0.0001, - "betas": [0.9, 0.999], - "eps": 1e-08, - "weight_decay": 0.1 - } - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 1e-09, - "warmup_max_lr": 0.0001, - "warmup_num_steps": 1000 - } - }, - "zero_optimization": { - "stage": 2, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "contiguous_gradients": true - }, - "tensorboard": { - "enabled": true, - "output_path": "logs/", - "job_name": "train_neo" - }, - "zero_allow_untested_optimizer": true, - "gradient_accumulation_steps": 4, - "gradient_clipping": 1.0, - "steps_per_print": 2.000000e+03, - "train_batch_size": 192, - "train_micro_batch_size_per_gpu": 6, - "wall_clock_breakdown": false -} -Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00039505958557128906 seconds -***** Running training ***** - Num examples = 117232 - Num Epochs = 10 - Instantaneous batch size per device = 6 - Total train batch size (w. parallel, distributed & accumulation) = 192 - Gradient Accumulation steps = 4 - Total optimization steps = 6100 - Number of trainable parameters = 125198592 - 0%| | 0/6100 [00:00 - trainer.save_state(trainer_save_dir) -TypeError: save_state() takes 1 positional argument but 2 were given -[2023-02-22 05:05:17,992] [INFO] [launch.py:350:main] Process 11659 exits successfully. -[2023-02-22 05:05:17,993] [INFO] [launch.py:350:main] Process 11661 exits successfully. -[2023-02-22 05:05:17,994] [INFO] [launch.py:350:main] Process 11655 exits successfully. 
-[2023-02-22 05:05:18,995] [INFO] [launch.py:350:main] Process 11654 exits successfully. -[2023-02-22 05:05:18,996] [INFO] [launch.py:350:main] Process 11656 exits successfully. -[2023-02-22 05:05:18,996] [INFO] [launch.py:350:main] Process 11653 exits successfully. -[2023-02-22 05:05:18,997] [INFO] [launch.py:350:main] Process 11657 exits successfully. -[2023-02-22 05:05:19,998] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11652 -[2023-02-22 05:05:20,000] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11653 -[2023-02-22 05:05:20,001] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11654 -[2023-02-22 05:05:20,001] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11655 -[2023-02-22 05:05:20,001] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11656 -[2023-02-22 05:05:20,001] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11657 -[2023-02-22 05:05:20,001] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11659 -[2023-02-22 05:05:20,001] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 11661 -[2023-02-22 05:05:20,002] [ERROR] [launch.py:324:sigkill_handler] ['/opt/conda/bin/python3', '-u', 'tune_gpt.py', '--local_rank=7', '--deepspeed', 'deepspeed.json', '--upload-experiment'] exits with return code = 1 -/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. - from pandas import MultiIndex, Int64Index
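Note on the failure recorded at the end of the deleted log: in the Hugging Face `transformers` library, `Trainer.save_state()` takes no arguments (it writes `trainer_state.json` into `trainer.args.output_dir`), so calling `trainer.save_state(trainer_save_dir)` raises the `TypeError: save_state() takes 1 positional argument but 2 were given` seen above. Below is a minimal, hedged sketch of a corrected save step; `trainer` and `trainer_save_dir` are the names taken from the traceback, and the surrounding `tune_gpt.py` code is not shown in this diff, so it is assumed here.

```python
# Hedged sketch only: tune_gpt.py is not part of this diff, so the surrounding
# code is assumed. `trainer` and `trainer_save_dir` come from the traceback above.
from transformers import Trainer


def save_training_outputs(trainer: Trainer, trainer_save_dir: str) -> None:
    # Trainer.save_state() accepts no path argument; it writes trainer_state.json
    # into trainer.args.output_dir, which is why passing trainer_save_dir
    # positionally raised the TypeError at the end of the run.
    trainer.save_model(trainer_save_dir)  # model weights/config to the chosen dir
    trainer.save_state()                  # trainer_state.json to args.output_dir
```

An alternative under the same assumptions is to point `trainer.args.output_dir` at the target directory before calling `save_state()`; either way, the extra positional argument has to be dropped.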