yhavinga committed
Commit 55bd44c
1 Parent(s): 4c4fca4

Saving weights and logs of step 16000

Determine_batch_size.ipynb ADDED
@@ -0,0 +1,289 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kqOqTZuKeJoa",
+ "outputId": "9f63819c-9bc1-4c15-e9cd-9c1121edd2a6"
+ },
+ "outputs": [],
+ "source": [
+ "#!pip install \"jax[tpu]>=0.2.16\" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "SQ-lhEVFeY4d",
+ "outputId": "7346c6b8-1848-4755-c114-94d6de50b50d"
+ },
+ "outputs": [],
+ "source": [
+ "#!git clone https://github.com/huggingface/transformers.git"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "9qSTMLvFfBVs",
+ "outputId": "40659f61-86d4-4ae5-9262-501557737705"
+ },
+ "outputs": [],
+ "source": [
+ "#!pip install ./transformers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "7Og7zRTrfm08"
+ },
+ "outputs": [],
+ "source": [
+ "#!pip install jaxlib>=0.2.9"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "nmQv7VMaf1L8"
+ },
+ "outputs": [],
+ "source": [
+ "#!pip install flax>=0.3.4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "MT6jpop-f4dc"
+ },
+ "outputs": [],
+ "source": [
+ "#!pip install optax>=0.0.9"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %%capture\n",
+ "# !pip install jupyterlab_widgets\n",
+ "# !pip install ipywidgets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "-F5NIqDmfDLb"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2021-07-08 10:11:37.310929: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n"
+ ]
+ }
+ ],
+ "source": [
+ "from transformers import (\n",
+ " CONFIG_MAPPING,\n",
+ " FLAX_MODEL_FOR_MASKED_LM_MAPPING,\n",
+ " BatchEncoding,\n",
+ " FlaxT5ForConditionalGeneration,\n",
+ " T5ForConditionalGeneration,\n",
+ " HfArgumentParser,\n",
+ " PreTrainedTokenizerBase,\n",
+ " T5Config,\n",
+ " T5TokenizerFast,\n",
+ " TrainingArguments,\n",
+ " is_tensorboard_available,\n",
+ " set_seed,\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "aInICxY6gREQ"
+ },
+ "outputs": [],
+ "source": [
+ "import flax\n",
+ "import jax\n",
+ "import jax.numpy as jnp\n",
+ "import optax\n",
+ "from flax import jax_utils, traverse_util\n",
+ "from flax.training import train_state\n",
+ "from flax.training.common_utils import get_metrics, onehot, shard\n",
+ "from transformers import (\n",
+ " CONFIG_MAPPING,\n",
+ " FLAX_MODEL_FOR_MASKED_LM_MAPPING,\n",
+ " BatchEncoding,\n",
+ " FlaxT5ForConditionalGeneration,\n",
+ " T5ForConditionalGeneration,\n",
+ " HfArgumentParser,\n",
+ " PreTrainedTokenizerBase,\n",
+ " T5Config,\n",
+ " T5TokenizerFast,\n",
+ " TrainingArguments,\n",
+ " is_tensorboard_available,\n",
+ " set_seed,\n",
+ ")\n",
+ "from transformers.models.t5.modeling_flax_t5 import shift_tokens_right\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "iEqVlHptfOCT"
+ },
+ "outputs": [],
+ "source": [
+ "tokenizer = T5TokenizerFast.from_pretrained(\"t5-small\")\n",
+ "config = T5Config.from_pretrained(\"t5-small\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "LNETw3cWfjbr",
+ "outputId": "95c0e750-c087-46dd-92fa-39f8ff0238f2"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:absl:Starting the local TPU driver.\n",
+ "INFO:absl:Unable to initialize backend 'tpu_driver': Not found: Unable to find driver in registry given worker: local://\n",
+ "INFO:absl:Unable to initialize backend 'gpu': Not found: Could not find registered platform with name: \"cuda\". Available platform names are: TPU Interpreter Host\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = FlaxT5ForConditionalGeneration(config)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "id": "T5F3BEA2f6xE"
+ },
+ "outputs": [],
+ "source": [
+ "input_ids = np.asarray(208 * [512 * [1]], dtype=np.int32)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def run_forward(input_ids, params):\n",
+ " return model(input_ids, decoder_input_ids=input_ids).logits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "jitted_forward = jax.jit(run_forward)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "logits = jitted_forward(input_ids, model.params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "TPU",
+ "colab": {
+ "name": "Untitled1.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
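
Note on Determine_batch_size.ipynb: the notebook checks whether a candidate batch fits in TPU memory by building a dummy (208, 512) batch of token ids and jitting a single forward pass. A minimal sketch of the same probe, assuming the `model` built in the notebook and standard XLA error behaviour; the helper name and the candidate sizes below are illustrative, not part of the notebook:

import numpy as np
import jax

def fits_in_memory(batch_size, seq_len=512):
    # Dummy token ids; the values do not matter for a memory probe.
    input_ids = np.ones((batch_size, seq_len), dtype=np.int32)
    try:
        logits = jax.jit(
            lambda ids, params: model(ids, decoder_input_ids=ids, params=params).logits
        )(input_ids, model.params)
        logits.block_until_ready()  # force execution rather than just tracing
        return True
    except RuntimeError:
        # XLA surfaces "resource exhausted" as a RuntimeError when the batch does not fit.
        return False

for bs in (32, 64, 128, 208, 256):
    print(bs, "fits" if fits_in_memory(bs) else "does not fit")
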
Load_preprocessed_dataset.ipynb ADDED
@@ -0,0 +1,147 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "cf148030-7287-4c9e-ae32-8d1e1c47be30",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import Dataset, DatasetDict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "5161b4ba-e8cf-43e1-b67e-503c29aa4271",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "datasets = DatasetDict.load_from_disk(\"./grouped_dataset\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "15f9d047-ac35-43d7-ab55-9f9afe96dd07",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " train: Dataset({\n",
+ " features: ['input_ids'],\n",
+ " num_rows: 86438919\n",
+ " })\n",
+ " validation: Dataset({\n",
+ " features: ['input_ids'],\n",
+ " num_rows: 4735324\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "d1d1218e-142e-441a-b20d-d300b13b172a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train = datasets['train']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9eaddfb1-242f-4a25-8789-efe97b2a5712",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "8aabb26f-19ca-467a-b383-3a693be70cac",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "86438919\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(train))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f3176986-5b34-4ed6-a643-e342db9a2ce8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "1205bbef-ba9d-4ddc-af2e-602d56b7dd64",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'input_ids': [256, 3, 20, 18452, 6690, 7757, 1286, 43, 10, 4942, 1286, 80, 12, 4782, 5442, 39, 5385, 33, 4, 5, 3, 2924, 117, 5669, 228, 21, 193, 9030, 511, 24, 11, 5, 665, 165, 4218, 7, 26, 264, 1528, 35, 105, 3, 19653, 12, 9661, 17156, 13955, 4, 132, 5, 611, 959, 961, 146, 6522, 7757, 1286, 89, 7500, 9716, 11, 5, 4868, 107, 13604, 12, 12836, 13368, 11, 611, 959, 4, 3, 69, 99, 12, 13132, 6690, 590, 5, 1803, 1867, 69, 7, 924, 10, 1762, 4, 3, 69, 538, 489, 14, 1149, 16, 3, 11384, 199, 116, 399, 4782, 291, 3, 6, 237, 13, 2629, 3, 8987, 291, 4, 69, 5, 3, 27, 72, 20, 325, 3, 2924, 133, 21, 105, 9030, 10, 1149, 242, 16, 144, 13572, 11, 9, 13401, 20, 7951, 8, 165, 4218, 4, 5, 1910]}\n"
+ ]
+ }
+ ],
+ "source": [
+ "it = iter(train)\n",
+ "\n",
+ "print(next(it))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5d4e8de-419c-4c70-896e-fbd640bb7321",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
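
Note on Load_preprocessed_dataset.ipynb: the notebook reloads a DatasetDict that has already been tokenized and grouped, so the expensive preprocessing does not have to be repeated. A short sketch of the save/load round trip, assuming `tokenized_datasets` is the DatasetDict produced by the tokenize and group_texts steps in run_t5_mlm_flax_custom_dataset.py (the save step itself is not shown in this commit):

from datasets import DatasetDict

# After tokenization and grouping, persist the result once:
tokenized_datasets.save_to_disk("./grouped_dataset")

# Later runs can skip straight to loading it:
datasets = DatasetDict.load_from_disk("./grouped_dataset")
print(datasets["train"].num_rows)  # 86438919 in this run
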
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:68189daaf4d05f88ba3305865c8d542080aac997060714ef57b8b116f56010c0
+ oid sha256:ed68bf4bf2ba245a90ae31d71a56c0f85d1ef7665f0748bd10826c688e5de825
 size 891548548
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:d4162e34e834b3cf52825caa4f4d2cbec358c35405a212052af6c977b2561680
+ oid sha256:1a8f60fdc3ad43a82bab7ec3dcaf1138179d7508798267becb15426d86b9385f
 size 891650495
run_t5.sh CHANGED
@@ -6,7 +6,6 @@ mkdir -p "${MODEL_DIR}/runs"

# T5 paper lr 0.01 with batch size 128
# We have a batch size of 8 devices * 32 = 256, so lr = 0.01/2
- # Warmup steps is set to 4% of the training steps

./run_t5_mlm_flax_custom_dataset.py \
--output_dir="${MODEL_DIR}" \
@@ -23,12 +22,13 @@ mkdir -p "${MODEL_DIR}/runs"
--dtype="bfloat16" \
--overwrite_output_dir \
--num_train_epochs="1" \
- --logging_steps="50" \
+ --logging_steps="20" \
--save_steps="2000" \
- --eval_steps="1000000" \
+ --eval_steps="10000000" \
+ --resume_from_checkpoint="${MODEL_DIR}/ckpt-14000" \
+ --warmup_steps="3413" \
--push_to_hub

- # --resume_from_checkpoint="${MODEL_DIR}/ckpt-1500" \


#git add pytorch_model.bin
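
For reference, the effective batch size and the steps per epoch implied by these flags can be worked out as below (a sketch: the 8 devices * 32 per device figure comes from the comment in the script and the 86,438,919 training examples from the dataset notebook above, the rest is plain arithmetic):

devices = 8
per_device_batch_size = 32
train_examples = 86_438_919

total_batch_size = devices * per_device_batch_size    # 256, matching the script comment
steps_per_epoch = train_examples // total_batch_size  # ~337,652 optimizer steps for one epoch
print(total_batch_size, steps_per_epoch)
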
run_t5_mlm_flax_custom_dataset.py CHANGED
@@ -31,7 +31,7 @@ from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
- from datasets import load_dataset
+ from datasets import load_dataset, DatasetDict
from tqdm import tqdm

import flax
@@ -552,15 +552,15 @@ if __name__ == "__main__":
add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
- add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
random.Random(SEED).shuffle(data_files)
@@ -580,7 +580,10 @@ if __name__ == "__main__":

train, val = train_val_files()

- datasets = load_dataset('json', data_files={'train': train, 'validation': val})
+ load_grouped = False
+
+ if not load_grouped:
+     datasets = load_dataset('json', data_files={'train': train, 'validation': val})

# data_files = {}
# if data_args.train_file is not None:
@@ -623,31 +626,8 @@ if __name__ == "__main__":
config = CONFIG_MAPPING[model_args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")

- # Preprocessing the datasets.
- # First we tokenize all the texts.
- if training_args.do_train:
-     column_names = datasets["train"].column_names
- else:
-     column_names = datasets["validation"].column_names
- text_column_name = "text" if "text" in column_names else column_names[0]
-
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

- # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
- # Since we make sure that all sequences are of the same length, no attention_mask is needed.
- def tokenize_function(examples):
-     return tokenizer(examples[text_column_name], return_attention_mask=False)
-
- logger.info(f"Start tokenization, remove_column_names = {column_names}")
-
- tokenized_datasets = datasets.map(
-     tokenize_function,
-     batched=True,
-     num_proc=data_args.preprocessing_num_workers,
-     remove_columns=column_names,
-     load_from_cache_file=not data_args.overwrite_cache,
- )
-
# T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
# To ensure that the input length is `max_seq_length`, we need to increase the maximum length
# according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
@@ -656,40 +636,64 @@ if __name__ == "__main__":
noise_density=data_args.mlm_probability,
mean_noise_span_length=data_args.mean_noise_span_length,
)
+ logger.info(f"Max seq length: {max_seq_length}, expanded_inputs_length: {expanded_inputs_length}, targets_length: {targets_length}")

- logger.info(f"Expanded_inputs_length: {expanded_inputs_length}, targets_length: {targets_length}")
-
- logger.info(f"Start group_texts")
-
- # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
- def group_texts(examples):
-     # Concatenate all texts.
-     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-     total_length = len(concatenated_examples[list(examples.keys())[0]])
-     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
-     # customize this part to your needs.
-     if total_length >= expanded_inputs_length:
-         total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
-     # Split by chunks of max_len.
-     result = {
-         k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
-         for k, t in concatenated_examples.items()
-     }
-     return result
+ # Preprocessing the datasets.
+ # First we tokenize all the texts.
+ if not load_grouped:
+     if training_args.do_train:
+         column_names = datasets["train"].column_names
+     else:
+         column_names = datasets["validation"].column_names
+     text_column_name = "text" if "text" in column_names else column_names[0]
+
+     # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+     # Since we make sure that all sequences are of the same length, no attention_mask is needed.
+     def tokenize_function(examples):
+         return tokenizer(examples[text_column_name], return_attention_mask=False)
+
+     logger.info(f"Start tokenization, remove_column_names = {column_names}")
+     tokenized_datasets = datasets.map(
+         tokenize_function,
+         batched=True,
+         num_proc=data_args.preprocessing_num_workers,
+         remove_columns=column_names,
+         load_from_cache_file=not data_args.overwrite_cache,
+     )

- # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
- # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
- # might be slower to preprocess.
- #
- # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
- # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
- tokenized_datasets = tokenized_datasets.map(
-     group_texts,
-     batched=True,
-     batch_size=200,
-     num_proc=data_args.preprocessing_num_workers,
-     load_from_cache_file=not data_args.overwrite_cache,
- )
+     # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
+     def group_texts(examples):
+         # Concatenate all texts.
+         concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+         total_length = len(concatenated_examples[list(examples.keys())[0]])
+         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+         # customize this part to your needs.
+         if total_length >= expanded_inputs_length:
+             total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
+         # Split by chunks of max_len.
+         result = {
+             k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
+             for k, t in concatenated_examples.items()
+         }
+         return result
+
+     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+     # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+     # might be slower to preprocess.
+     #
+     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+     logger.info(f"Start group_texts")
+     tokenized_datasets = tokenized_datasets.map(
+         group_texts,
+         batched=True,
+         batch_size=200,
+         num_proc=data_args.preprocessing_num_workers,
+         load_from_cache_file=not data_args.overwrite_cache,
+     )
+ else:
+     logger.info("Loading tokenized and grouped dataset")
+     tokenized_datasets = DatasetDict.load_from_disk("/home/yeb/grouped_datasets")

# Enable tensorboard only on the master node
has_tensorboard = is_tensorboard_available()
@@ -751,9 +755,14 @@ if __name__ == "__main__":

# Create learning rate schedule

- # See https://arxiv.org/pdf/2104.07705.pdf for rationale of choosing the peak at 4% of training steps
- warmup_steps = int(0.04 * num_train_steps)
- logging.info(f"Warmup steps set to 4% = {warmup_steps} of total train steps {num_train_steps}")
+ if training_args.warmup_steps:
+     warmup_steps = training_args.warmup_steps
+ elif training_args.warmup_ratio:
+     # See https://arxiv.org/pdf/2104.07705.pdf for rationale of choosing the peak at % of training steps
+     warmup_steps = int(training_args.warmup_ratio * num_train_steps)
+     logging.info(f"Warmup steps set to {100*training_args.warmup_ratio}% = {warmup_steps} of total train steps {num_train_steps}")
+ else:
+     raise Exception("Need either --warmup_steps or --warmup_ratio")

warmup_fn = optax.linear_schedule(
    init_value=0.0, end_value=training_args.learning_rate, transition_steps=warmup_steps
@@ -863,7 +872,8 @@ if __name__ == "__main__":
state = jax_utils.replicate(state)

logger.info("***** Running training *****")
- logger.info(f" Num examples = {len(datasets['train'])}")
+ if not load_grouped:
+     logger.info(f" Num examples = {len(datasets['train'])}")
logger.info(f" Num tokenized group examples {len(tokenized_datasets['train'])}")
logger.info(f" Num Epochs = {num_epochs}")
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
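
The new warmup handling above only selects the number of warmup steps; the schedule itself is built with optax.linear_schedule. A minimal sketch of how such a warmup-then-decay schedule is typically joined together, assuming the values from run_t5.sh and the step count worked out earlier (the linear decay phase is an assumption mirroring the upstream Flax T5 example, not code shown in this diff):

import optax

warmup_steps = 3413        # --warmup_steps from run_t5.sh
learning_rate = 0.005      # lr = 0.01 / 2 per the script comment
num_train_steps = 337_652  # approximate steps for one epoch, see the arithmetic above

warmup_fn = optax.linear_schedule(
    init_value=0.0, end_value=learning_rate, transition_steps=warmup_steps
)
# Assumed decay phase: linear ramp back to zero over the remaining steps.
decay_fn = optax.linear_schedule(
    init_value=learning_rate,
    end_value=0.0,
    transition_steps=num_train_steps - warmup_steps,
)
linear_decay_lr_schedule_fn = optax.join_schedules(
    schedules=[warmup_fn, decay_fn], boundaries=[warmup_steps]
)
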
runs/Jul10_12-03-45_t1v-n-0e7426e8-w-0/events.out.tfevents.1625920526.t1v-n-0e7426e8-w-0.48005.3.v2 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ac6e43e7a7661df39374a0364897650b948e816446aa7cfb526ff2f0f51b9e1e
- size 40
runs/Jul10_12-39-58_t1v-n-0e7426e8-w-0/events.out.tfevents.1625922498.t1v-n-0e7426e8-w-0.52901.3.v2 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:d9f295ae46710e76c6932be7c90ac6db5f1f58dbce55b34870ab3d43248fdaee
- size 2077245
+ oid sha256:62ea16c934f55451bfb14bd666567f0c8837fead4ad2e1f6a8adbd8d11fd25a6
+ size 2359167
runs/{Jul10_07-37-20_t1v-n-0e7426e8-w-0/events.out.tfevents.1625902752.t1v-n-0e7426e8-w-0.18397.3.v2 → Jul11_09-15-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1625995853.t1v-n-0e7426e8-w-0.145718.3.v2} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:1aa4fd14ba6d0007ac2b4c7ad5f7b03ab486b3899ece3eba1fefe852923f2366
- size 40
+ oid sha256:ecdd317adb51d2b44773888aaa52793f97b5af475a8f35560774d02bd6ae20a2
+ size 300940