Upload experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154

Browse files

Files changed (8) hide show

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/configs.json +25 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/deepspeed.json +48 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/final_checkpoint/config.json +54 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/final_checkpoint/pytorch_model.bin +3 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/logs/1671446773.5511963/events.out.tfevents.1671446773.7459dae3-a471-4828-975b-f8e23ae7ab31.144520.2 +3 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/logs/events.out.tfevents.1671446773.7459dae3-a471-4828-975b-f8e23ae7ab31.144520.1 +3 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/logs/train_neo/events.out.tfevents.1671446763.7459dae3-a471-4828-975b-f8e23ae7ab31.144520.0 +3 -0
experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/output.log +81 -0

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/configs.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    "output_dir": "./results",
+    "evaluation_strategy": "no",
+    "do_eval": false,
+    "eval_steps": 0,
+    "log_level": "error",
+    "logging_first_step": true,
+    "logging_steps": 5,
+    "logging_dir": "./logs",
+    "save_steps": 200,
+    "save_total_limit": 2,
+    "num_train_epochs": 5,
+    "per_device_train_batch_size": 6,
+    "optim": "adamw_torch",
+    "dataloader_drop_last": true,
+    "warmup_steps": 500,
+    "weight_decay": 0.05,
+    "learning_rate": 5e-05,
+    "local_rank": -1,
+    "deepspeed": "deepspeed.json",
+    "total_gpus": 2,
+    "v_cpus": 16,
+    "total_memory_in_gb": 48190.40625,
+    "dataset_limit": 10
+}

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/deepspeed.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 5e8,
+        "contiguous_gradients": true
+    },
+    "tensorboard": {
+        "enabled": true,
+        "output_path": "logs/",
+        "job_name": "train_neo"
+    },
+    "zero_allow_untested_optimizer": true,
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/final_checkpoint/config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "_name_or_path": "EleutherAI/gpt-neo-125M",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPTNeoForCausalLM"
+  ],
+  "attention_dropout": 0,
+  "attention_layers": [
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local"
+  ],
+  "attention_types": [
+    [
+      [
+        "global",
+        "local"
+      ],
+      6
+    ]
+  ],
+  "bos_token_id": 50256,
+  "embed_dropout": 0,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neo",
+  "num_heads": 12,
+  "num_layers": 12,
+  "resid_dropout": 0,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1",
+  "use_cache": true,
+  "vocab_size": 50257,
+  "window_size": 256
+}

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/final_checkpoint/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cde597b4e4371138cd8adebf86513a8ce07063c528179c0ff6330eec2445d28c
+size 551154684

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/logs/1671446773.5511963/events.out.tfevents.1671446773.7459dae3-a471-4828-975b-f8e23ae7ab31.144520.2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5060c28b0d5829c518e342be88110a091f75478b7ad87737f11a66b34dd0c45
+size 5457

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/logs/events.out.tfevents.1671446773.7459dae3-a471-4828-975b-f8e23ae7ab31.144520.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:874b6b4200d756fc7fdf469432978b3ee219e37e458298358aad97a799ba509e
+size 6266

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/logs/train_neo/events.out.tfevents.1671446763.7459dae3-a471-4828-975b-f8e23ae7ab31.144520.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a83957c6b655d9a4f65836a5b53c5bd8bbb3ab86b44400e2035d689b89de85d
+size 6507

experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/output.log ADDED Viewed

@@ -0,0 +1,81 @@
  0%|          | 0/55 [00:00<?, ?it/s]
  2%|▏         | 1/55 [00:01<01:13,  1.37s/it]
  2%|▏         | 1/55 [00:01<01:13,  1.37s/it]
  4%|▎         | 2/55 [00:02<01:13,  1.39s/it]
  5%|▌         | 3/55 [00:04<01:13,  1.42s/it]
  7%|▋         | 4/55 [00:05<01:12,  1.42s/it]
  9%|▉         | 5/55 [00:07<01:10,  1.41s/it]
  9%|▉         | 5/55 [00:07<01:10,  1.41s/it]
 11%|█         | 6/55 [00:08<01:09,  1.41s/it]
 13%|█▎        | 7/55 [00:09<01:07,  1.41s/it]
 15%|█▍        | 8/55 [00:11<01:06,  1.41s/it]
 16%|█▋        | 9/55 [00:12<01:04,  1.41s/it]
 18%|█▊        | 10/55 [00:14<01:03,  1.41s/it]
 18%|█▊        | 10/55 [00:14<01:03,  1.41s/it]
 20%|██        | 11/55 [00:15<01:02,  1.41s/it]
 22%|██▏       | 12/55 [00:16<01:00,  1.41s/it]
 24%|██▎       | 13/55 [00:18<00:59,  1.41s/it]
 25%|██▌       | 14/55 [00:19<00:57,  1.41s/it]
 27%|██▋       | 15/55 [00:21<00:56,  1.41s/it]
 27%|██▋       | 15/55 [00:21<00:56,  1.41s/it]
 29%|██▉       | 16/55 [00:22<00:55,  1.41s/it]
 31%|███       | 17/55 [00:23<00:53,  1.41s/it]
 33%|███▎      | 18/55 [00:25<00:52,  1.41s/it]
 35%|███▍      | 19/55 [00:26<00:50,  1.41s/it]
 36%|███▋      | 20/55 [00:28<00:49,  1.41s/it]
 36%|███▋      | 20/55 [00:28<00:49,  1.41s/it]
 38%|███▊      | 21/55 [00:29<00:48,  1.41s/it]
 40%|████      | 22/55 [00:31<00:46,  1.41s/it]
 42%|████▏     | 23/55 [00:32<00:45,  1.41s/it]
 44%|████▎     | 24/55 [00:33<00:43,  1.41s/it]
 45%|████▌     | 25/55 [00:35<00:42,  1.41s/it]
 45%|████▌     | 25/55 [00:35<00:42,  1.41s/it]
 47%|████▋     | 26/55 [00:36<00:40,  1.40s/it]
 49%|████▉     | 27/55 [00:38<00:39,  1.40s/it]
 51%|█████     | 28/55 [00:39<00:37,  1.40s/it]
 53%|█████▎    | 29/55 [00:40<00:36,  1.40s/it]
 55%|█████▍    | 30/55 [00:42<00:35,  1.41s/it]
 55%|█████▍    | 30/55 [00:42<00:35,  1.41s/it]
 56%|█████▋    | 31/55 [00:43<00:33,  1.41s/it]
 58%|█████▊    | 32/55 [00:45<00:32,  1.41s/it]
 60%|██████    | 33/55 [00:46<00:30,  1.41s/it]
 62%|██████▏   | 34/55 [00:47<00:29,  1.41s/it]
 64%|██████▎   | 35/55 [00:49<00:28,  1.41s/it]
 64%|██████▎   | 35/55 [00:49<00:28,  1.41s/it]
 65%|██████▌   | 36/55 [00:50<00:26,  1.41s/it]
 67%|██████▋   | 37/55 [00:52<00:25,  1.42s/it]
 69%|██████▉   | 38/55 [00:53<00:24,  1.42s/it]
 71%|███████   | 39/55 [00:55<00:22,  1.42s/it]
 73%|███████▎  | 40/55 [00:56<00:21,  1.42s/it]
 73%|███████▎  | 40/55 [00:56<00:21,  1.42s/it]
 75%|███████▍  | 41/55 [00:57<00:19,  1.42s/it]
 76%|███████▋  | 42/55 [00:59<00:18,  1.42s/it]
 78%|███████▊  | 43/55 [01:00<00:17,  1.42s/it]
 80%|████████  | 44/55 [01:02<00:15,  1.42s/it]
 82%|████████▏ | 45/55 [01:03<00:14,  1.42s/it]
 82%|████████▏ | 45/55 [01:03<00:14,  1.42s/it]
 84%|████████▎ | 46/55 [01:04<00:12,  1.41s/it]
 85%|████████▌ | 47/55 [01:06<00:11,  1.41s/it]
 87%|████████▋ | 48/55 [01:07<00:09,  1.41s/it]
 89%|████████▉ | 49/55 [01:09<00:08,  1.41s/it]
 91%|█████████ | 50/55 [01:10<00:07,  1.41s/it]
 91%|█████████ | 50/55 [01:10<00:07,  1.41s/it]
 93%|█████████▎| 51/55 [01:11<00:05,  1.42s/it]
 95%|█████████▍| 52/55 [01:13<00:04,  1.42s/it]
 96%|█████████▋| 53/55 [01:14<00:02,  1.42s/it]
 98%|█████████▊| 54/55 [01:16<00:01,  1.42s/it]

+nohup: ignoring input
+[2022-12-19 10:45:52,072] [WARNING] [runner.py:179:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
+[2022-12-19 10:45:52,087] [INFO] [runner.py:508:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 tune_gpt.py --deepspeed deepspeed.json --limit=10 --local_rank=-1
+[2022-12-19 10:45:53,665] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0, 1]}
+[2022-12-19 10:45:53,665] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=2, node_rank=0
+[2022-12-19 10:45:53,665] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
+[2022-12-19 10:45:53,665] [INFO] [launch.py:162:main] dist_world_size=2
+[2022-12-19 10:45:53,665] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0,1
+No config specified, defaulting to: apps/all
+Found cached dataset apps (/home/user/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
+Max length: 2048
+PyTorch: setting up devices
+[2022-12-19 10:46:03,625] [INFO] [comm.py:654:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+No config specified, defaulting to: apps/all
+Found cached dataset apps (/home/user/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
+Max length: 2048
+PyTorch: setting up devices
+The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
+GPU memory occupied: 3108 MB.
+The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
+GPU memory occupied: 3108 MB.
+Using /home/user/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
+Using /home/user/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
+Detected CUDA files, patching ldflags
+Emitting ninja build file /home/user/.cache/torch_extensions/py38_cu116/cpu_adam/build.ninja...
+Building extension module cpu_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+ninja: no work to do.
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 2.6374411582946777 seconds
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 2.604843854904175 seconds
+Using /home/user/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
+Using /home/user/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
+Emitting ninja build file /home/user/.cache/torch_extensions/py38_cu116/utils/build.ninja...
+Building extension module utils...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+ninja: no work to do.
+Loading extension module utils...
+Time to load utils op: 0.2902805805206299 seconds
+Loading extension module utils...
+Time to load utils op: 0.2025001049041748 seconds
+Rank: 0 partition count [2] and sizes[(62599296, False)]
+Rank: 1 partition count [2] and sizes[(62599296, False)]
+Using /home/user/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
+No modifications detected for re-loaded extension module utils, skipping build step...
+Loading extension module utils...
+Time to load utils op: 0.0009868144989013672 seconds
+Using /home/user/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
+No modifications detected for re-loaded extension module utils, skipping build step...
+Loading extension module utils...
+Time to load utils op: 0.0003800392150878906 seconds
  0%|          | 0/55 [00:00<?, ?it/s]
  2%|▏         | 1/55 [00:01<01:13,  1.37s/it]
  2%|▏         | 1/55 [00:01<01:13,  1.37s/it]
  4%|▎         | 2/55 [00:02<01:13,  1.39s/it]
  5%|▌         | 3/55 [00:04<01:13,  1.42s/it]
  7%|▋         | 4/55 [00:05<01:12,  1.42s/it]
  9%|▉         | 5/55 [00:07<01:10,  1.41s/it]
  9%|▉         | 5/55 [00:07<01:10,  1.41s/it]
 11%|█         | 6/55 [00:08<01:09,  1.41s/it]
 13%|█▎        | 7/55 [00:09<01:07,  1.41s/it]
 15%|█▍        | 8/55 [00:11<01:06,  1.41s/it]
 16%|█▋        | 9/55 [00:12<01:04,  1.41s/it]
 18%|█▊        | 10/55 [00:14<01:03,  1.41s/it]
 18%|█▊        | 10/55 [00:14<01:03,  1.41s/it]
 20%|██        | 11/55 [00:15<01:02,  1.41s/it]
 22%|██▏       | 12/55 [00:16<01:00,  1.41s/it]
 24%|██▎       | 13/55 [00:18<00:59,  1.41s/it]
 25%|██▌       | 14/55 [00:19<00:57,  1.41s/it]
 27%|██▋       | 15/55 [00:21<00:56,  1.41s/it]
 27%|██▋       | 15/55 [00:21<00:56,  1.41s/it]
 29%|██▉       | 16/55 [00:22<00:55,  1.41s/it]
 31%|███       | 17/55 [00:23<00:53,  1.41s/it]
 33%|███▎      | 18/55 [00:25<00:52,  1.41s/it]
 35%|███▍      | 19/55 [00:26<00:50,  1.41s/it]
 36%|███▋      | 20/55 [00:28<00:49,  1.41s/it]
 36%|███▋      | 20/55 [00:28<00:49,  1.41s/it]
 38%|███▊      | 21/55 [00:29<00:48,  1.41s/it]
 40%|████      | 22/55 [00:31<00:46,  1.41s/it]
 42%|████▏     | 23/55 [00:32<00:45,  1.41s/it]
 44%|████▎     | 24/55 [00:33<00:43,  1.41s/it]
 45%|████▌     | 25/55 [00:35<00:42,  1.41s/it]
 45%|████▌     | 25/55 [00:35<00:42,  1.41s/it]
 47%|████▋     | 26/55 [00:36<00:40,  1.40s/it]
 49%|████▉     | 27/55 [00:38<00:39,  1.40s/it]
 51%|█████     | 28/55 [00:39<00:37,  1.40s/it]
 53%|█████▎    | 29/55 [00:40<00:36,  1.40s/it]
 55%|█████▍    | 30/55 [00:42<00:35,  1.41s/it]
 55%|█████▍    | 30/55 [00:42<00:35,  1.41s/it]
 56%|█████▋    | 31/55 [00:43<00:33,  1.41s/it]
 58%|█████▊    | 32/55 [00:45<00:32,  1.41s/it]
 60%|██████    | 33/55 [00:46<00:30,  1.41s/it]
 62%|██████▏   | 34/55 [00:47<00:29,  1.41s/it]
 64%|██████▎   | 35/55 [00:49<00:28,  1.41s/it]
 64%|██████▎   | 35/55 [00:49<00:28,  1.41s/it]
 65%|██████▌   | 36/55 [00:50<00:26,  1.41s/it]
 67%|██████▋   | 37/55 [00:52<00:25,  1.42s/it]
 69%|██████▉   | 38/55 [00:53<00:24,  1.42s/it]
 71%|███████   | 39/55 [00:55<00:22,  1.42s/it]
 73%|███████▎  | 40/55 [00:56<00:21,  1.42s/it]
 73%|███████▎  | 40/55 [00:56<00:21,  1.42s/it]
 75%|███████▍  | 41/55 [00:57<00:19,  1.42s/it]
 76%|███████▋  | 42/55 [00:59<00:18,  1.42s/it]
 78%|███████▊  | 43/55 [01:00<00:17,  1.42s/it]
 80%|████████  | 44/55 [01:02<00:15,  1.42s/it]
 82%|████████▏ | 45/55 [01:03<00:14,  1.42s/it]
 82%|████████▏ | 45/55 [01:03<00:14,  1.42s/it]
 84%|████████▎ | 46/55 [01:04<00:12,  1.41s/it]
 85%|████████▌ | 47/55 [01:06<00:11,  1.41s/it]
 87%|████████▋ | 48/55 [01:07<00:09,  1.41s/it]
 89%|████████▉ | 49/55 [01:09<00:08,  1.41s/it]
 91%|█████████ | 50/55 [01:10<00:07,  1.41s/it]
 91%|█████████ | 50/55 [01:10<00:07,  1.41s/it]
 93%|█████████▎| 51/55 [01:11<00:05,  1.42s/it]
 95%|█████████▍| 52/55 [01:13<00:04,  1.42s/it]
 96%|█████████▋| 53/55 [01:14<00:02,  1.42s/it]
 98%|█████████▊| 54/55 [01:16<00:01,  1.42s/it]
+Samples/second: 8.86
+Time: 77.67
+Samples/second: 8.88
+GPU memory occupied: 44704 MB.
+Traceback (most recent call last):
+  File "tune_gpt.py", line 223, in <module>
+    shutil.move(os.path.join(pwd_path, "output.log"), os.path.join(final_save_dir))
+  File "/usr/lib/python3.8/shutil.py", line 789, in move
+    raise Error("Destination path '%s' already exists" % real_dst)
+shutil.Error: Destination path 'experiments/2022-12-19-ab8f3a39c84fea7f66bf71860384bbce5df5fb3523e7dabd22b35c3ecfefb154/output.log' already exists
+[2022-12-19 10:47:33,785] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 144520
+[2022-12-19 10:47:33,786] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 144521
+[2022-12-19 10:47:33,793] [ERROR] [launch.py:324:sigkill_handler] ['/usr/bin/python3', '-u', 'tune_gpt.py', '--local_rank=1', '--deepspeed', 'deepspeed.json', '--limit=10', '--local_rank=-1'] exits with return code = 1