Training in progress, step 8000, checkpoint

Browse files

Files changed (13) hide show

checkpoint-8000/added_tokens.json +4 -0
checkpoint-8000/config.json +92 -0
checkpoint-8000/generation_config.json +9 -0
checkpoint-8000/model.safetensors +3 -0
checkpoint-8000/optimizer.pt +3 -0
checkpoint-8000/preprocessor_config.json +19 -0
checkpoint-8000/rng_state.pth +3 -0
checkpoint-8000/scheduler.pt +3 -0
checkpoint-8000/special_tokens_map.json +13 -0
checkpoint-8000/spm_char.model +3 -0
checkpoint-8000/tokenizer_config.json +63 -0
checkpoint-8000/trainer_state.json +1281 -0
checkpoint-8000/training_args.bin +3 -0

checkpoint-8000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<ctc_blank>": 80,
+  "<mask>": 79
+}

checkpoint-8000/config.json ADDED Viewed

	@@ -0,0 +1,92 @@

+{
+  "_name_or_path": "microsoft/speecht5_tts",
+  "activation_dropout": 0.1,
+  "apply_spec_augment": true,
+  "architectures": [
+    "SpeechT5ForTextToSpeech"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 0,
+  "conv_bias": false,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.1,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.1,
+  "encoder_layers": 12,
+  "encoder_max_relative_position": 160,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_norm": "group",
+  "feat_proj_dropout": 0.0,
+  "guided_attention_loss_num_heads": 2,
+  "guided_attention_loss_scale": 10.0,
+  "guided_attention_loss_sigma": 0.4,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "is_encoder_decoder": true,
+  "layer_norm_eps": 1e-05,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": 1876,
+  "max_speech_positions": 1876,
+  "max_text_positions": 600,
+  "model_type": "speecht5",
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_mel_bins": 80,
+  "pad_token_id": 1,
+  "positional_dropout": 0.1,
+  "reduction_factor": 2,
+  "scale_embedding": false,
+  "speaker_embedding_dim": 512,
+  "speech_decoder_postnet_dropout": 0.5,
+  "speech_decoder_postnet_kernel": 5,
+  "speech_decoder_postnet_layers": 5,
+  "speech_decoder_postnet_units": 256,
+  "speech_decoder_prenet_dropout": 0.5,
+  "speech_decoder_prenet_layers": 2,
+  "speech_decoder_prenet_units": 256,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0.dev0",
+  "use_cache": false,
+  "use_guided_attention_loss": true,
+  "vocab_size": 81
+}

checkpoint-8000/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "decoder_start_token_id": 2,
+  "eos_token_id": 2,
+  "max_length": 1876,
+  "pad_token_id": 1,
+  "transformers_version": "4.41.0.dev0"
+}

checkpoint-8000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a138a16b7718b5c0152abfec1485396118c13a434e3bf29eb5310cc23c32fb2f
+size 577789320

checkpoint-8000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70610f592572c0414a97a80dab895c8f139e41227519fe2ad5a119c7d0d9f447
+size 1155772233

checkpoint-8000/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "do_normalize": false,
+  "feature_extractor_type": "SpeechT5FeatureExtractor",
+  "feature_size": 1,
+  "fmax": 7600,
+  "fmin": 80,
+  "frame_signal_scale": 1.0,
+  "hop_length": 16,
+  "mel_floor": 1e-10,
+  "num_mel_bins": 80,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "SpeechT5Processor",
+  "reduction_factor": 2,
+  "return_attention_mask": true,
+  "sampling_rate": 16000,
+  "win_function": "hann_window",
+  "win_length": 64
+}

checkpoint-8000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f1602c1de8f7f1059463b72c682668262463165604b0c05b91bd9f1000f4dfa
+size 14244

checkpoint-8000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:797161cb36b8434a2e9b424b1551b1d663e1ac4112ca3f634c3c132aead6996b
+size 1064

checkpoint-8000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

checkpoint-8000/spm_char.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
+size 238473

checkpoint-8000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "79": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "80": {
+      "content": "<ctc_blank>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "model_max_length": 600,
+  "normalize": false,
+  "pad_token": "<pad>",
+  "processor_class": "SpeechT5Processor",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "SpeechT5Tokenizer",
+  "unk_token": "<unk>"
+}

checkpoint-8000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1281 @@

+{
+  "best_metric": 0.3261738121509552,
+  "best_model_checkpoint": "mikhail-panzo/zlm_b32_le4_s8000/checkpoint-8000",
+  "epoch": 3.350785340314136,
+  "eval_steps": 500,
+  "global_step": 8000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.020942408376963352,
+      "grad_norm": 3.1775436401367188,
+      "learning_rate": 2.35e-06,
+      "loss": 1.1228,
+      "step": 50
+    },
+    {
+      "epoch": 0.041884816753926704,
+      "grad_norm": 4.18980073928833,
+      "learning_rate": 4.85e-06,
+      "loss": 0.8304,
+      "step": 100
+    },
+    {
+      "epoch": 0.06282722513089005,
+      "grad_norm": 4.5355024337768555,
+      "learning_rate": 7.35e-06,
+      "loss": 0.7701,
+      "step": 150
+    },
+    {
+      "epoch": 0.08376963350785341,
+      "grad_norm": 3.9590957164764404,
+      "learning_rate": 9.85e-06,
+      "loss": 0.7096,
+      "step": 200
+    },
+    {
+      "epoch": 0.10471204188481675,
+      "grad_norm": 5.419675350189209,
+      "learning_rate": 1.235e-05,
+      "loss": 0.6518,
+      "step": 250
+    },
+    {
+      "epoch": 0.1256544502617801,
+      "grad_norm": 3.1052777767181396,
+      "learning_rate": 1.485e-05,
+      "loss": 0.6326,
+      "step": 300
+    },
+    {
+      "epoch": 0.14659685863874344,
+      "grad_norm": 3.4649453163146973,
+      "learning_rate": 1.7349999999999998e-05,
+      "loss": 0.6241,
+      "step": 350
+    },
+    {
+      "epoch": 0.16753926701570682,
+      "grad_norm": 4.962624549865723,
+      "learning_rate": 1.985e-05,
+      "loss": 0.5723,
+      "step": 400
+    },
+    {
+      "epoch": 0.18848167539267016,
+      "grad_norm": 2.7586095333099365,
+      "learning_rate": 2.235e-05,
+      "loss": 0.5764,
+      "step": 450
+    },
+    {
+      "epoch": 0.2094240837696335,
+      "grad_norm": 3.5610299110412598,
+      "learning_rate": 2.485e-05,
+      "loss": 0.5552,
+      "step": 500
+    },
+    {
+      "epoch": 0.2094240837696335,
+      "eval_loss": 0.48825809359550476,
+      "eval_runtime": 277.5892,
+      "eval_samples_per_second": 30.581,
+      "eval_steps_per_second": 3.826,
+      "step": 500
+    },
+    {
+      "epoch": 0.23036649214659685,
+      "grad_norm": 3.3351809978485107,
+      "learning_rate": 2.7350000000000004e-05,
+      "loss": 0.5322,
+      "step": 550
+    },
+    {
+      "epoch": 0.2513089005235602,
+      "grad_norm": 3.7222373485565186,
+      "learning_rate": 2.985e-05,
+      "loss": 0.5186,
+      "step": 600
+    },
+    {
+      "epoch": 0.27225130890052357,
+      "grad_norm": 3.1811156272888184,
+      "learning_rate": 3.235e-05,
+      "loss": 0.5157,
+      "step": 650
+    },
+    {
+      "epoch": 0.2931937172774869,
+      "grad_norm": 2.2585642337799072,
+      "learning_rate": 3.485e-05,
+      "loss": 0.5089,
+      "step": 700
+    },
+    {
+      "epoch": 0.31413612565445026,
+      "grad_norm": 3.999460220336914,
+      "learning_rate": 3.735e-05,
+      "loss": 0.5117,
+      "step": 750
+    },
+    {
+      "epoch": 0.33507853403141363,
+      "grad_norm": 3.739990472793579,
+      "learning_rate": 3.9850000000000006e-05,
+      "loss": 0.5031,
+      "step": 800
+    },
+    {
+      "epoch": 0.35602094240837695,
+      "grad_norm": 4.251980781555176,
+      "learning_rate": 4.235e-05,
+      "loss": 0.5064,
+      "step": 850
+    },
+    {
+      "epoch": 0.3769633507853403,
+      "grad_norm": 2.770602226257324,
+      "learning_rate": 4.4850000000000006e-05,
+      "loss": 0.4921,
+      "step": 900
+    },
+    {
+      "epoch": 0.39790575916230364,
+      "grad_norm": 2.506974220275879,
+      "learning_rate": 4.735e-05,
+      "loss": 0.4839,
+      "step": 950
+    },
+    {
+      "epoch": 0.418848167539267,
+      "grad_norm": 2.2666189670562744,
+      "learning_rate": 4.9850000000000006e-05,
+      "loss": 0.4913,
+      "step": 1000
+    },
+    {
+      "epoch": 0.418848167539267,
+      "eval_loss": 0.4265913963317871,
+      "eval_runtime": 265.1599,
+      "eval_samples_per_second": 32.015,
+      "eval_steps_per_second": 4.005,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4397905759162304,
+      "grad_norm": 3.1451058387756348,
+      "learning_rate": 5.235e-05,
+      "loss": 0.4722,
+      "step": 1050
+    },
+    {
+      "epoch": 0.4607329842931937,
+      "grad_norm": 3.197997570037842,
+      "learning_rate": 5.485e-05,
+      "loss": 0.4692,
+      "step": 1100
+    },
+    {
+      "epoch": 0.4816753926701571,
+      "grad_norm": 2.9112601280212402,
+      "learning_rate": 5.7350000000000005e-05,
+      "loss": 0.4738,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5026178010471204,
+      "grad_norm": 3.036731243133545,
+      "learning_rate": 5.9850000000000005e-05,
+      "loss": 0.4521,
+      "step": 1200
+    },
+    {
+      "epoch": 0.5235602094240838,
+      "grad_norm": 5.021958351135254,
+      "learning_rate": 6.235000000000001e-05,
+      "loss": 0.4666,
+      "step": 1250
+    },
+    {
+      "epoch": 0.5445026178010471,
+      "grad_norm": 3.302204132080078,
+      "learning_rate": 6.485e-05,
+      "loss": 0.4625,
+      "step": 1300
+    },
+    {
+      "epoch": 0.5654450261780105,
+      "grad_norm": 3.2643635272979736,
+      "learning_rate": 6.735e-05,
+      "loss": 0.4683,
+      "step": 1350
+    },
+    {
+      "epoch": 0.5863874345549738,
+      "grad_norm": 1.7499467134475708,
+      "learning_rate": 6.985e-05,
+      "loss": 0.449,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6073298429319371,
+      "grad_norm": 1.3616622686386108,
+      "learning_rate": 7.235000000000001e-05,
+      "loss": 0.4523,
+      "step": 1450
+    },
+    {
+      "epoch": 0.6282722513089005,
+      "grad_norm": 2.5826191902160645,
+      "learning_rate": 7.485e-05,
+      "loss": 0.446,
+      "step": 1500
+    },
+    {
+      "epoch": 0.6282722513089005,
+      "eval_loss": 0.3975289463996887,
+      "eval_runtime": 266.4222,
+      "eval_samples_per_second": 31.863,
+      "eval_steps_per_second": 3.986,
+      "step": 1500
+    },
+    {
+      "epoch": 0.6492146596858639,
+      "grad_norm": 3.6052303314208984,
+      "learning_rate": 7.735e-05,
+      "loss": 0.4449,
+      "step": 1550
+    },
+    {
+      "epoch": 0.6701570680628273,
+      "grad_norm": 3.4120566844940186,
+      "learning_rate": 7.985e-05,
+      "loss": 0.4477,
+      "step": 1600
+    },
+    {
+      "epoch": 0.6910994764397905,
+      "grad_norm": 2.187040090560913,
+      "learning_rate": 8.235000000000001e-05,
+      "loss": 0.4522,
+      "step": 1650
+    },
+    {
+      "epoch": 0.7120418848167539,
+      "grad_norm": 1.718518853187561,
+      "learning_rate": 8.485e-05,
+      "loss": 0.4431,
+      "step": 1700
+    },
+    {
+      "epoch": 0.7329842931937173,
+      "grad_norm": 1.8248894214630127,
+      "learning_rate": 8.735000000000001e-05,
+      "loss": 0.4496,
+      "step": 1750
+    },
+    {
+      "epoch": 0.7539267015706806,
+      "grad_norm": 2.733355760574341,
+      "learning_rate": 8.985e-05,
+      "loss": 0.4297,
+      "step": 1800
+    },
+    {
+      "epoch": 0.774869109947644,
+      "grad_norm": 2.5899884700775146,
+      "learning_rate": 9.235000000000001e-05,
+      "loss": 0.4588,
+      "step": 1850
+    },
+    {
+      "epoch": 0.7958115183246073,
+      "grad_norm": 1.5704914331436157,
+      "learning_rate": 9.485e-05,
+      "loss": 0.4345,
+      "step": 1900
+    },
+    {
+      "epoch": 0.8167539267015707,
+      "grad_norm": 5.18487548828125,
+      "learning_rate": 9.735000000000001e-05,
+      "loss": 0.4325,
+      "step": 1950
+    },
+    {
+      "epoch": 0.837696335078534,
+      "grad_norm": 2.4392499923706055,
+      "learning_rate": 9.985000000000001e-05,
+      "loss": 0.4222,
+      "step": 2000
+    },
+    {
+      "epoch": 0.837696335078534,
+      "eval_loss": 0.39489272236824036,
+      "eval_runtime": 267.0393,
+      "eval_samples_per_second": 31.789,
+      "eval_steps_per_second": 3.977,
+      "step": 2000
+    },
+    {
+      "epoch": 0.8586387434554974,
+      "grad_norm": 2.1138336658477783,
+      "learning_rate": 9.921666666666668e-05,
+      "loss": 0.4438,
+      "step": 2050
+    },
+    {
+      "epoch": 0.8795811518324608,
+      "grad_norm": 2.3158669471740723,
+      "learning_rate": 9.838333333333334e-05,
+      "loss": 0.421,
+      "step": 2100
+    },
+    {
+      "epoch": 0.900523560209424,
+      "grad_norm": 2.1935179233551025,
+      "learning_rate": 9.755000000000001e-05,
+      "loss": 0.4247,
+      "step": 2150
+    },
+    {
+      "epoch": 0.9214659685863874,
+      "grad_norm": 1.7845231294631958,
+      "learning_rate": 9.671666666666667e-05,
+      "loss": 0.4203,
+      "step": 2200
+    },
+    {
+      "epoch": 0.9424083769633508,
+      "grad_norm": 1.7287964820861816,
+      "learning_rate": 9.588333333333334e-05,
+      "loss": 0.4306,
+      "step": 2250
+    },
+    {
+      "epoch": 0.9633507853403142,
+      "grad_norm": 2.2276289463043213,
+      "learning_rate": 9.505e-05,
+      "loss": 0.4392,
+      "step": 2300
+    },
+    {
+      "epoch": 0.9842931937172775,
+      "grad_norm": 2.248389720916748,
+      "learning_rate": 9.421666666666668e-05,
+      "loss": 0.4304,
+      "step": 2350
+    },
+    {
+      "epoch": 1.0052356020942408,
+      "grad_norm": 1.5984878540039062,
+      "learning_rate": 9.338333333333333e-05,
+      "loss": 0.4125,
+      "step": 2400
+    },
+    {
+      "epoch": 1.0261780104712042,
+      "grad_norm": 2.6061534881591797,
+      "learning_rate": 9.255e-05,
+      "loss": 0.4229,
+      "step": 2450
+    },
+    {
+      "epoch": 1.0471204188481675,
+      "grad_norm": 2.127110481262207,
+      "learning_rate": 9.171666666666667e-05,
+      "loss": 0.4273,
+      "step": 2500
+    },
+    {
+      "epoch": 1.0471204188481675,
+      "eval_loss": 0.38292551040649414,
+      "eval_runtime": 263.8923,
+      "eval_samples_per_second": 32.168,
+      "eval_steps_per_second": 4.024,
+      "step": 2500
+    },
+    {
+      "epoch": 1.068062827225131,
+      "grad_norm": 1.9867545366287231,
+      "learning_rate": 9.088333333333334e-05,
+      "loss": 0.4164,
+      "step": 2550
+    },
+    {
+      "epoch": 1.0890052356020943,
+      "grad_norm": 2.1518051624298096,
+      "learning_rate": 9.005000000000001e-05,
+      "loss": 0.4038,
+      "step": 2600
+    },
+    {
+      "epoch": 1.1099476439790577,
+      "grad_norm": 2.1266932487487793,
+      "learning_rate": 8.921666666666668e-05,
+      "loss": 0.418,
+      "step": 2650
+    },
+    {
+      "epoch": 1.130890052356021,
+      "grad_norm": 1.9351108074188232,
+      "learning_rate": 8.838333333333334e-05,
+      "loss": 0.4213,
+      "step": 2700
+    },
+    {
+      "epoch": 1.1518324607329844,
+      "grad_norm": 1.4189810752868652,
+      "learning_rate": 8.755e-05,
+      "loss": 0.4201,
+      "step": 2750
+    },
+    {
+      "epoch": 1.1727748691099475,
+      "grad_norm": 3.590930223464966,
+      "learning_rate": 8.671666666666667e-05,
+      "loss": 0.4043,
+      "step": 2800
+    },
+    {
+      "epoch": 1.193717277486911,
+      "grad_norm": 3.049971103668213,
+      "learning_rate": 8.588333333333334e-05,
+      "loss": 0.4036,
+      "step": 2850
+    },
+    {
+      "epoch": 1.2146596858638743,
+      "grad_norm": 2.1481425762176514,
+      "learning_rate": 8.505000000000001e-05,
+      "loss": 0.4067,
+      "step": 2900
+    },
+    {
+      "epoch": 1.2356020942408377,
+      "grad_norm": 1.774927020072937,
+      "learning_rate": 8.421666666666666e-05,
+      "loss": 0.4126,
+      "step": 2950
+    },
+    {
+      "epoch": 1.256544502617801,
+      "grad_norm": 1.624089241027832,
+      "learning_rate": 8.338333333333333e-05,
+      "loss": 0.4028,
+      "step": 3000
+    },
+    {
+      "epoch": 1.256544502617801,
+      "eval_loss": 0.3674115538597107,
+      "eval_runtime": 263.8647,
+      "eval_samples_per_second": 32.172,
+      "eval_steps_per_second": 4.025,
+      "step": 3000
+    },
+    {
+      "epoch": 1.2774869109947644,
+      "grad_norm": 1.8440332412719727,
+      "learning_rate": 8.255e-05,
+      "loss": 0.4026,
+      "step": 3050
+    },
+    {
+      "epoch": 1.2984293193717278,
+      "grad_norm": 2.144713878631592,
+      "learning_rate": 8.171666666666667e-05,
+      "loss": 0.4007,
+      "step": 3100
+    },
+    {
+      "epoch": 1.3193717277486912,
+      "grad_norm": 1.9491198062896729,
+      "learning_rate": 8.088333333333334e-05,
+      "loss": 0.4087,
+      "step": 3150
+    },
+    {
+      "epoch": 1.3403141361256545,
+      "grad_norm": 2.0903196334838867,
+      "learning_rate": 8.005000000000001e-05,
+      "loss": 0.4034,
+      "step": 3200
+    },
+    {
+      "epoch": 1.3612565445026177,
+      "grad_norm": 2.226724863052368,
+      "learning_rate": 7.921666666666668e-05,
+      "loss": 0.4106,
+      "step": 3250
+    },
+    {
+      "epoch": 1.3821989528795813,
+      "grad_norm": 1.8056219816207886,
+      "learning_rate": 7.838333333333335e-05,
+      "loss": 0.3984,
+      "step": 3300
+    },
+    {
+      "epoch": 1.4031413612565444,
+      "grad_norm": 1.8196921348571777,
+      "learning_rate": 7.755e-05,
+      "loss": 0.4018,
+      "step": 3350
+    },
+    {
+      "epoch": 1.4240837696335078,
+      "grad_norm": 1.7930132150650024,
+      "learning_rate": 7.671666666666667e-05,
+      "loss": 0.395,
+      "step": 3400
+    },
+    {
+      "epoch": 1.4450261780104712,
+      "grad_norm": 2.525432586669922,
+      "learning_rate": 7.588333333333334e-05,
+      "loss": 0.399,
+      "step": 3450
+    },
+    {
+      "epoch": 1.4659685863874345,
+      "grad_norm": 1.2433278560638428,
+      "learning_rate": 7.505e-05,
+      "loss": 0.3941,
+      "step": 3500
+    },
+    {
+      "epoch": 1.4659685863874345,
+      "eval_loss": 0.3616171181201935,
+      "eval_runtime": 265.4304,
+      "eval_samples_per_second": 31.982,
+      "eval_steps_per_second": 4.001,
+      "step": 3500
+    },
+    {
+      "epoch": 1.486910994764398,
+      "grad_norm": 1.859680414199829,
+      "learning_rate": 7.421666666666666e-05,
+      "loss": 0.4023,
+      "step": 3550
+    },
+    {
+      "epoch": 1.5078534031413613,
+      "grad_norm": 1.9057739973068237,
+      "learning_rate": 7.338333333333333e-05,
+      "loss": 0.3938,
+      "step": 3600
+    },
+    {
+      "epoch": 1.5287958115183247,
+      "grad_norm": 1.6138437986373901,
+      "learning_rate": 7.255e-05,
+      "loss": 0.3995,
+      "step": 3650
+    },
+    {
+      "epoch": 1.5497382198952878,
+      "grad_norm": 1.9373960494995117,
+      "learning_rate": 7.171666666666667e-05,
+      "loss": 0.3934,
+      "step": 3700
+    },
+    {
+      "epoch": 1.5706806282722514,
+      "grad_norm": 1.7537634372711182,
+      "learning_rate": 7.088333333333334e-05,
+      "loss": 0.3969,
+      "step": 3750
+    },
+    {
+      "epoch": 1.5916230366492146,
+      "grad_norm": 2.310279607772827,
+      "learning_rate": 7.005000000000001e-05,
+      "loss": 0.3967,
+      "step": 3800
+    },
+    {
+      "epoch": 1.6125654450261782,
+      "grad_norm": 3.323341131210327,
+      "learning_rate": 6.921666666666668e-05,
+      "loss": 0.4058,
+      "step": 3850
+    },
+    {
+      "epoch": 1.6335078534031413,
+      "grad_norm": 1.784133791923523,
+      "learning_rate": 6.838333333333333e-05,
+      "loss": 0.3898,
+      "step": 3900
+    },
+    {
+      "epoch": 1.6544502617801047,
+      "grad_norm": 1.8042694330215454,
+      "learning_rate": 6.755e-05,
+      "loss": 0.3902,
+      "step": 3950
+    },
+    {
+      "epoch": 1.675392670157068,
+      "grad_norm": 1.8511914014816284,
+      "learning_rate": 6.671666666666667e-05,
+      "loss": 0.3871,
+      "step": 4000
+    },
+    {
+      "epoch": 1.675392670157068,
+      "eval_loss": 0.3518759310245514,
+      "eval_runtime": 268.3666,
+      "eval_samples_per_second": 31.632,
+      "eval_steps_per_second": 3.957,
+      "step": 4000
+    },
+    {
+      "epoch": 1.6963350785340314,
+      "grad_norm": 1.1159111261367798,
+      "learning_rate": 6.588333333333334e-05,
+      "loss": 0.3842,
+      "step": 4050
+    },
+    {
+      "epoch": 1.7172774869109948,
+      "grad_norm": 1.8107820749282837,
+      "learning_rate": 6.505e-05,
+      "loss": 0.3909,
+      "step": 4100
+    },
+    {
+      "epoch": 1.738219895287958,
+      "grad_norm": 1.5878946781158447,
+      "learning_rate": 6.421666666666666e-05,
+      "loss": 0.3974,
+      "step": 4150
+    },
+    {
+      "epoch": 1.7591623036649215,
+      "grad_norm": 3.3478894233703613,
+      "learning_rate": 6.338333333333333e-05,
+      "loss": 0.3874,
+      "step": 4200
+    },
+    {
+      "epoch": 1.7801047120418847,
+      "grad_norm": 2.9195618629455566,
+      "learning_rate": 6.255e-05,
+      "loss": 0.3893,
+      "step": 4250
+    },
+    {
+      "epoch": 1.8010471204188483,
+      "grad_norm": 1.6017109155654907,
+      "learning_rate": 6.171666666666667e-05,
+      "loss": 0.3827,
+      "step": 4300
+    },
+    {
+      "epoch": 1.8219895287958114,
+      "grad_norm": 2.468721628189087,
+      "learning_rate": 6.0883333333333334e-05,
+      "loss": 0.3835,
+      "step": 4350
+    },
+    {
+      "epoch": 1.8429319371727748,
+      "grad_norm": 1.5800402164459229,
+      "learning_rate": 6.005000000000001e-05,
+      "loss": 0.3767,
+      "step": 4400
+    },
+    {
+      "epoch": 1.8638743455497382,
+      "grad_norm": 2.512275218963623,
+      "learning_rate": 5.9216666666666665e-05,
+      "loss": 0.3835,
+      "step": 4450
+    },
+    {
+      "epoch": 1.8848167539267016,
+      "grad_norm": 1.4855787754058838,
+      "learning_rate": 5.8383333333333334e-05,
+      "loss": 0.3828,
+      "step": 4500
+    },
+    {
+      "epoch": 1.8848167539267016,
+      "eval_loss": 0.3493475914001465,
+      "eval_runtime": 265.6954,
+      "eval_samples_per_second": 31.95,
+      "eval_steps_per_second": 3.997,
+      "step": 4500
+    },
+    {
+      "epoch": 1.905759162303665,
+      "grad_norm": 1.5831927061080933,
+      "learning_rate": 5.755e-05,
+      "loss": 0.3798,
+      "step": 4550
+    },
+    {
+      "epoch": 1.9267015706806283,
+      "grad_norm": 1.8235031366348267,
+      "learning_rate": 5.671666666666667e-05,
+      "loss": 0.3772,
+      "step": 4600
+    },
+    {
+      "epoch": 1.9476439790575917,
+      "grad_norm": 2.0443477630615234,
+      "learning_rate": 5.5883333333333334e-05,
+      "loss": 0.3818,
+      "step": 4650
+    },
+    {
+      "epoch": 1.9685863874345548,
+      "grad_norm": 1.5491324663162231,
+      "learning_rate": 5.505e-05,
+      "loss": 0.376,
+      "step": 4700
+    },
+    {
+      "epoch": 1.9895287958115184,
+      "grad_norm": 1.934651494026184,
+      "learning_rate": 5.421666666666667e-05,
+      "loss": 0.3762,
+      "step": 4750
+    },
+    {
+      "epoch": 2.0104712041884816,
+      "grad_norm": 1.5579476356506348,
+      "learning_rate": 5.338333333333334e-05,
+      "loss": 0.3899,
+      "step": 4800
+    },
+    {
+      "epoch": 2.031413612565445,
+      "grad_norm": 3.049567461013794,
+      "learning_rate": 5.255e-05,
+      "loss": 0.379,
+      "step": 4850
+    },
+    {
+      "epoch": 2.0523560209424083,
+      "grad_norm": 2.302692174911499,
+      "learning_rate": 5.1716666666666666e-05,
+      "loss": 0.376,
+      "step": 4900
+    },
+    {
+      "epoch": 2.073298429319372,
+      "grad_norm": 1.2674555778503418,
+      "learning_rate": 5.088333333333334e-05,
+      "loss": 0.3804,
+      "step": 4950
+    },
+    {
+      "epoch": 2.094240837696335,
+      "grad_norm": 4.3979902267456055,
+      "learning_rate": 5.005e-05,
+      "loss": 0.3954,
+      "step": 5000
+    },
+    {
+      "epoch": 2.094240837696335,
+      "eval_loss": 0.3490452766418457,
+      "eval_runtime": 267.7969,
+      "eval_samples_per_second": 31.699,
+      "eval_steps_per_second": 3.966,
+      "step": 5000
+    },
+    {
+      "epoch": 2.115183246073298,
+      "grad_norm": 1.4126474857330322,
+      "learning_rate": 4.9216666666666666e-05,
+      "loss": 0.3814,
+      "step": 5050
+    },
+    {
+      "epoch": 2.136125654450262,
+      "grad_norm": 2.161151885986328,
+      "learning_rate": 4.8383333333333335e-05,
+      "loss": 0.3796,
+      "step": 5100
+    },
+    {
+      "epoch": 2.157068062827225,
+      "grad_norm": 1.7521085739135742,
+      "learning_rate": 4.755e-05,
+      "loss": 0.3796,
+      "step": 5150
+    },
+    {
+      "epoch": 2.1780104712041886,
+      "grad_norm": 1.831732988357544,
+      "learning_rate": 4.671666666666667e-05,
+      "loss": 0.3745,
+      "step": 5200
+    },
+    {
+      "epoch": 2.1989528795811517,
+      "grad_norm": 1.640143871307373,
+      "learning_rate": 4.5883333333333335e-05,
+      "loss": 0.3687,
+      "step": 5250
+    },
+    {
+      "epoch": 2.2198952879581153,
+      "grad_norm": 1.7855136394500732,
+      "learning_rate": 4.5050000000000004e-05,
+      "loss": 0.3737,
+      "step": 5300
+    },
+    {
+      "epoch": 2.2408376963350785,
+      "grad_norm": 2.1165611743927,
+      "learning_rate": 4.4216666666666666e-05,
+      "loss": 0.3681,
+      "step": 5350
+    },
+    {
+      "epoch": 2.261780104712042,
+      "grad_norm": 2.91493558883667,
+      "learning_rate": 4.3383333333333335e-05,
+      "loss": 0.3775,
+      "step": 5400
+    },
+    {
+      "epoch": 2.282722513089005,
+      "grad_norm": 3.0871639251708984,
+      "learning_rate": 4.2550000000000004e-05,
+      "loss": 0.3714,
+      "step": 5450
+    },
+    {
+      "epoch": 2.303664921465969,
+      "grad_norm": 1.933838129043579,
+      "learning_rate": 4.171666666666667e-05,
+      "loss": 0.381,
+      "step": 5500
+    },
+    {
+      "epoch": 2.303664921465969,
+      "eval_loss": 0.33975139260292053,
+      "eval_runtime": 264.665,
+      "eval_samples_per_second": 32.075,
+      "eval_steps_per_second": 4.013,
+      "step": 5500
+    },
+    {
+      "epoch": 2.324607329842932,
+      "grad_norm": 1.6925002336502075,
+      "learning_rate": 4.0883333333333335e-05,
+      "loss": 0.372,
+      "step": 5550
+    },
+    {
+      "epoch": 2.345549738219895,
+      "grad_norm": 1.2782198190689087,
+      "learning_rate": 4.0050000000000004e-05,
+      "loss": 0.3705,
+      "step": 5600
+    },
+    {
+      "epoch": 2.3664921465968587,
+      "grad_norm": 1.279897689819336,
+      "learning_rate": 3.921666666666667e-05,
+      "loss": 0.3739,
+      "step": 5650
+    },
+    {
+      "epoch": 2.387434554973822,
+      "grad_norm": 2.082263231277466,
+      "learning_rate": 3.8383333333333336e-05,
+      "loss": 0.3696,
+      "step": 5700
+    },
+    {
+      "epoch": 2.4083769633507854,
+      "grad_norm": 1.2950184345245361,
+      "learning_rate": 3.7550000000000005e-05,
+      "loss": 0.3729,
+      "step": 5750
+    },
+    {
+      "epoch": 2.4293193717277486,
+      "grad_norm": 2.0218422412872314,
+      "learning_rate": 3.671666666666667e-05,
+      "loss": 0.374,
+      "step": 5800
+    },
+    {
+      "epoch": 2.450261780104712,
+      "grad_norm": 1.9062658548355103,
+      "learning_rate": 3.5883333333333336e-05,
+      "loss": 0.3701,
+      "step": 5850
+    },
+    {
+      "epoch": 2.4712041884816753,
+      "grad_norm": 1.6323776245117188,
+      "learning_rate": 3.505e-05,
+      "loss": 0.3683,
+      "step": 5900
+    },
+    {
+      "epoch": 2.492146596858639,
+      "grad_norm": 1.4120862483978271,
+      "learning_rate": 3.421666666666667e-05,
+      "loss": 0.3674,
+      "step": 5950
+    },
+    {
+      "epoch": 2.513089005235602,
+      "grad_norm": 2.368260383605957,
+      "learning_rate": 3.3383333333333336e-05,
+      "loss": 0.372,
+      "step": 6000
+    },
+    {
+      "epoch": 2.513089005235602,
+      "eval_loss": 0.3372337222099304,
+      "eval_runtime": 267.436,
+      "eval_samples_per_second": 31.742,
+      "eval_steps_per_second": 3.971,
+      "step": 6000
+    },
+    {
+      "epoch": 2.5340314136125652,
+      "grad_norm": 1.5971177816390991,
+      "learning_rate": 3.2550000000000005e-05,
+      "loss": 0.3665,
+      "step": 6050
+    },
+    {
+      "epoch": 2.554973821989529,
+      "grad_norm": 1.4700311422348022,
+      "learning_rate": 3.171666666666667e-05,
+      "loss": 0.3681,
+      "step": 6100
+    },
+    {
+      "epoch": 2.5759162303664924,
+      "grad_norm": 1.6570911407470703,
+      "learning_rate": 3.0883333333333336e-05,
+      "loss": 0.3641,
+      "step": 6150
+    },
+    {
+      "epoch": 2.5968586387434556,
+      "grad_norm": 1.9697566032409668,
+      "learning_rate": 3.0050000000000002e-05,
+      "loss": 0.3736,
+      "step": 6200
+    },
+    {
+      "epoch": 2.6178010471204187,
+      "grad_norm": 3.0906572341918945,
+      "learning_rate": 2.921666666666667e-05,
+      "loss": 0.3606,
+      "step": 6250
+    },
+    {
+      "epoch": 2.6387434554973823,
+      "grad_norm": 1.426192283630371,
+      "learning_rate": 2.8383333333333333e-05,
+      "loss": 0.3746,
+      "step": 6300
+    },
+    {
+      "epoch": 2.6596858638743455,
+      "grad_norm": 1.8590399026870728,
+      "learning_rate": 2.7550000000000002e-05,
+      "loss": 0.3675,
+      "step": 6350
+    },
+    {
+      "epoch": 2.680628272251309,
+      "grad_norm": 1.3725066184997559,
+      "learning_rate": 2.6716666666666668e-05,
+      "loss": 0.3669,
+      "step": 6400
+    },
+    {
+      "epoch": 2.701570680628272,
+      "grad_norm": 1.1929622888565063,
+      "learning_rate": 2.5883333333333337e-05,
+      "loss": 0.363,
+      "step": 6450
+    },
+    {
+      "epoch": 2.7225130890052354,
+      "grad_norm": 1.2838363647460938,
+      "learning_rate": 2.5050000000000002e-05,
+      "loss": 0.3642,
+      "step": 6500
+    },
+    {
+      "epoch": 2.7225130890052354,
+      "eval_loss": 0.3314475417137146,
+      "eval_runtime": 271.3873,
+      "eval_samples_per_second": 31.28,
+      "eval_steps_per_second": 3.913,
+      "step": 6500
+    },
+    {
+      "epoch": 2.743455497382199,
+      "grad_norm": 1.5131884813308716,
+      "learning_rate": 2.4216666666666668e-05,
+      "loss": 0.3631,
+      "step": 6550
+    },
+    {
+      "epoch": 2.7643979057591626,
+      "grad_norm": 2.1872663497924805,
+      "learning_rate": 2.3383333333333334e-05,
+      "loss": 0.3611,
+      "step": 6600
+    },
+    {
+      "epoch": 2.7853403141361257,
+      "grad_norm": 1.3774406909942627,
+      "learning_rate": 2.2550000000000003e-05,
+      "loss": 0.3674,
+      "step": 6650
+    },
+    {
+      "epoch": 2.806282722513089,
+      "grad_norm": 1.6061275005340576,
+      "learning_rate": 2.1716666666666668e-05,
+      "loss": 0.3654,
+      "step": 6700
+    },
+    {
+      "epoch": 2.8272251308900525,
+      "grad_norm": 1.909685492515564,
+      "learning_rate": 2.0883333333333334e-05,
+      "loss": 0.3611,
+      "step": 6750
+    },
+    {
+      "epoch": 2.8481675392670156,
+      "grad_norm": 1.6721019744873047,
+      "learning_rate": 2.0050000000000003e-05,
+      "loss": 0.3663,
+      "step": 6800
+    },
+    {
+      "epoch": 2.869109947643979,
+      "grad_norm": 1.6825881004333496,
+      "learning_rate": 1.921666666666667e-05,
+      "loss": 0.3638,
+      "step": 6850
+    },
+    {
+      "epoch": 2.8900523560209423,
+      "grad_norm": 1.6210919618606567,
+      "learning_rate": 1.8383333333333334e-05,
+      "loss": 0.3643,
+      "step": 6900
+    },
+    {
+      "epoch": 2.9109947643979055,
+      "grad_norm": 1.5710177421569824,
+      "learning_rate": 1.755e-05,
+      "loss": 0.3657,
+      "step": 6950
+    },
+    {
+      "epoch": 2.931937172774869,
+      "grad_norm": 1.158400535583496,
+      "learning_rate": 1.6716666666666665e-05,
+      "loss": 0.3692,
+      "step": 7000
+    },
+    {
+      "epoch": 2.931937172774869,
+      "eval_loss": 0.3307726979255676,
+      "eval_runtime": 273.8683,
+      "eval_samples_per_second": 30.997,
+      "eval_steps_per_second": 3.878,
+      "step": 7000
+    },
+    {
+      "epoch": 2.9528795811518327,
+      "grad_norm": 1.70427668094635,
+      "learning_rate": 1.5883333333333334e-05,
+      "loss": 0.3604,
+      "step": 7050
+    },
+    {
+      "epoch": 2.973821989528796,
+      "grad_norm": 1.7018851041793823,
+      "learning_rate": 1.505e-05,
+      "loss": 0.3603,
+      "step": 7100
+    },
+    {
+      "epoch": 2.994764397905759,
+      "grad_norm": 1.4028674364089966,
+      "learning_rate": 1.4216666666666667e-05,
+      "loss": 0.3729,
+      "step": 7150
+    },
+    {
+      "epoch": 3.0157068062827226,
+      "grad_norm": 1.1933259963989258,
+      "learning_rate": 1.3383333333333335e-05,
+      "loss": 0.3626,
+      "step": 7200
+    },
+    {
+      "epoch": 3.0366492146596857,
+      "grad_norm": 1.202587366104126,
+      "learning_rate": 1.255e-05,
+      "loss": 0.3565,
+      "step": 7250
+    },
+    {
+      "epoch": 3.0575916230366493,
+      "grad_norm": 1.1894952058792114,
+      "learning_rate": 1.1716666666666667e-05,
+      "loss": 0.3615,
+      "step": 7300
+    },
+    {
+      "epoch": 3.0785340314136125,
+      "grad_norm": 1.1371740102767944,
+      "learning_rate": 1.0883333333333335e-05,
+      "loss": 0.3578,
+      "step": 7350
+    },
+    {
+      "epoch": 3.099476439790576,
+      "grad_norm": 1.69568932056427,
+      "learning_rate": 1.005e-05,
+      "loss": 0.3593,
+      "step": 7400
+    },
+    {
+      "epoch": 3.1204188481675392,
+      "grad_norm": 1.6758700609207153,
+      "learning_rate": 9.216666666666666e-06,
+      "loss": 0.3562,
+      "step": 7450
+    },
+    {
+      "epoch": 3.141361256544503,
+      "grad_norm": 1.5202099084854126,
+      "learning_rate": 8.383333333333333e-06,
+      "loss": 0.3555,
+      "step": 7500
+    },
+    {
+      "epoch": 3.141361256544503,
+      "eval_loss": 0.32830387353897095,
+      "eval_runtime": 278.2777,
+      "eval_samples_per_second": 30.506,
+      "eval_steps_per_second": 3.816,
+      "step": 7500
+    },
+    {
+      "epoch": 3.162303664921466,
+      "grad_norm": 1.6989357471466064,
+      "learning_rate": 7.55e-06,
+      "loss": 0.3589,
+      "step": 7550
+    },
+    {
+      "epoch": 3.183246073298429,
+      "grad_norm": 1.243001103401184,
+      "learning_rate": 6.716666666666667e-06,
+      "loss": 0.3607,
+      "step": 7600
+    },
+    {
+      "epoch": 3.2041884816753927,
+      "grad_norm": 1.2895828485488892,
+      "learning_rate": 5.8833333333333335e-06,
+      "loss": 0.3555,
+      "step": 7650
+    },
+    {
+      "epoch": 3.225130890052356,
+      "grad_norm": 1.1663577556610107,
+      "learning_rate": 5.050000000000001e-06,
+      "loss": 0.3571,
+      "step": 7700
+    },
+    {
+      "epoch": 3.2460732984293195,
+      "grad_norm": 1.4683984518051147,
+      "learning_rate": 4.216666666666666e-06,
+      "loss": 0.3553,
+      "step": 7750
+    },
+    {
+      "epoch": 3.2670157068062826,
+      "grad_norm": 0.9420146942138672,
+      "learning_rate": 3.3833333333333337e-06,
+      "loss": 0.3566,
+      "step": 7800
+    },
+    {
+      "epoch": 3.287958115183246,
+      "grad_norm": 1.1002775430679321,
+      "learning_rate": 2.55e-06,
+      "loss": 0.3635,
+      "step": 7850
+    },
+    {
+      "epoch": 3.3089005235602094,
+      "grad_norm": 0.9635823965072632,
+      "learning_rate": 1.7166666666666668e-06,
+      "loss": 0.3593,
+      "step": 7900
+    },
+    {
+      "epoch": 3.329842931937173,
+      "grad_norm": 1.0634921789169312,
+      "learning_rate": 8.833333333333334e-07,
+      "loss": 0.3564,
+      "step": 7950
+    },
+    {
+      "epoch": 3.350785340314136,
+      "grad_norm": 1.5584676265716553,
+      "learning_rate": 5.0000000000000004e-08,
+      "loss": 0.3536,
+      "step": 8000
+    },
+    {
+      "epoch": 3.350785340314136,
+      "eval_loss": 0.3261738121509552,
+      "eval_runtime": 275.3237,
+      "eval_samples_per_second": 30.833,
+      "eval_steps_per_second": 3.857,
+      "step": 8000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 8000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.5804068604023104e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-8000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f8ba8e957961bfac2918c828bd8fb1b1b5848bc9b1402a9fd5cd5021a38c66f
+size 5304