Yeb Havinga committed
Commit 49e8767 • 1 Parent(s): 2c7b7d9
Replace scripts and model with improved version
Browse files
- Load_preprocessed_dataset.ipynb  +0 -165
- Load_token_group_dataset.ipynb  +0 -567
- README.md  +30 -12
- config.json  +1 -1
- flax_model.msgpack  +1 -1
- flax_to_pt.py  +26 -6
- opt_state.msgpack  +0 -3
- pytorch_model.bin  +1 -1
- run_t5.sh  +37 -79
- run_t5_mlm_flax_custom_dataset.py → run_t5_mlm_flax.py  +213 -246
- streaming_dataset_filter_test.py  +0 -93
- tf_model.h5  +2 -2
- train_tokenizer.py  +0 -66
- training_state.json  +0 -1
Load_preprocessed_dataset.ipynb
DELETED (165 lines)

The deleted notebook (Python 3.8.10 kernel) loaded the preprocessed, grouped dataset from disk and inspected it. Its cells and saved outputs:

from datasets import Dataset, DatasetDict

datasets = DatasetDict.load_from_disk("/home/yeb/grouped_dataset")
# Output at last save: FileNotFoundError: [Errno 2] No such file or directory:
# '/home/yeb/grouped_dataset/dataset_dict.json'

datasets
# DatasetDict({
#     train: Dataset({features: ['input_ids'], num_rows: 86438919})
#     validation: Dataset({features: ['input_ids'], num_rows: 4735324})
# })

train = datasets['train']

print(len(train))
# 86438919

it = iter(train)

print(next(it))
# {'input_ids': [256, 3, 20, 18452, 6690, 7757, 1286, 43, 10, 4942, 1286, 80, 12, 4782, 5442, ...]}
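The FileNotFoundError in the last saved output only means that the /home/yeb/grouped_dataset directory (with its dataset_dict.json) was no longer present when the notebook was re-run. For context, a minimal sketch of how such a DatasetDict is written and read back with the datasets library; the tiny example data and the "grouped_dataset_demo" path are made up for illustration:

from datasets import Dataset, DatasetDict

# Hypothetical miniature stand-in for the real grouped dataset.
dd = DatasetDict({
    "train": Dataset.from_dict({"input_ids": [[256, 3, 20], [18452, 6690, 7757]]}),
    "validation": Dataset.from_dict({"input_ids": [[1286, 43, 10]]}),
})
dd.save_to_disk("grouped_dataset_demo")           # writes dataset_dict.json plus one directory per split
reloaded = DatasetDict.load_from_disk("grouped_dataset_demo")
print(reloaded["train"].num_rows)                 # 2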
Load_token_group_dataset.ipynb
DELETED (567 lines)

The deleted notebook (Python 3.8.10 kernel) built the grouped dataset: it collected and split the cleaned jsonlines files, loaded them with the datasets library, tokenized them, grouped the tokens into fixed-length chunks, and spot-checked the result. Its cells and saved outputs:

data_files = []
data_dir = "."
def train_val_files():
    import glob
    import random
    SEED = 12345

    def add_jsonlines_dir(path, filespec):
        global data_files
        data_files += glob.glob(f"{path}/{filespec}")
        data_files = list(set(data_files))
        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")

    # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*73*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
    add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
    add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
    random.Random(SEED).shuffle(data_files)

    total = len(data_files)
    print(total)
    perc = 0.05
    val_size = int(perc * total)
    train_size = total - val_size
    train = data_files[:train_size]
    val = data_files[train_size:]
    print(f"Got {len(train)} training files and {perc*100} % {len(val)} validation files")

    assert list(set(train) & set(val)) == [], "Train overlaps with test"

    return train, val

train, val = train_val_files()
# Number of files 20 after adding ./c4_cleaned glob *73*.gz
# Number of files 39 after adding ./c4_cleaned glob *47*.gz
# Number of files 60 after adding ./c4_cleaned glob *12*.gz
# Number of files 79 after adding ./c4_cleaned glob *29*.gz
# Number of files 97 after adding ./c4_cleaned glob *74*.gz
# Number of files 116 after adding ./c4_cleaned glob *26*.gz
# Number of files 135 after adding ./c4_cleaned glob *54*.gz
# Number of files 154 after adding ./c4_cleaned glob *68*.gz
# Number of files 172 after adding ./c4_cleaned glob *57*.gz
# Number of files 189 after adding ./c4_cleaned glob *46*.gz
# Number of files 206 after adding ./c4_cleaned glob *35*.gz
# Number of files 226 after adding ./c4_cleaned glob *13*.gz
# Number of files 242 after adding ./c4_cleaned glob *41*.gz
# Number of files 259 after adding ./c4_cleaned glob *52*.gz
# Number of files 276 after adding ./c4_cleaned glob *63*.gz
# Number of files 292 after adding ./c4_cleaned glob *85*.gz
# Number of files 309 after adding ./c4_cleaned glob *81*.gz
# Number of files 326 after adding ./c4_cleaned glob *96*.gz
# Number of files 526 after adding ./nrc_uniq_cleaned_20210223 glob *.gz
# Number of files 726 after adding ./nu_uniq_cleaned_20210225 glob *.gz
# 726
# Got 690 training files and 5.0 % 36 validation files

from datasets import load_dataset
datasets = load_dataset('json', data_files={'train': train, 'validation': val})
# Using custom data configuration default-ce92ec7dc3732df4
# Downloading and preparing dataset json/default to
# /home/yeb/.cache/huggingface/datasets/json/default-ce92ec7dc3732df4/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9...
# Dataset json downloaded and prepared. Subsequent calls will reuse this data.

print(f"Num examples = {len(datasets['train'])}")
# Num examples = 21153916

from transformers import (
    CONFIG_MAPPING,
    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
    BatchEncoding,
    FlaxT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    HfArgumentParser,
    PreTrainedTokenizerBase,
    T5Config,
    T5TokenizerFast,
    TrainingArguments,
    is_tensorboard_available,
    set_seed,
)

tokenizer = T5TokenizerFast.from_pretrained("./t5-base-dutch")

def tokenize_function(examples):
    return tokenizer(examples['text'], return_attention_mask=False)

column_names = datasets["train"].column_names
print(f"Start tokenization, remove_column_names = {column_names}")

tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=96,
    remove_columns=column_names,
    load_from_cache_file=True,
)
# Special tokens have been added in the vocabulary, make sure the associated word
# embeddings are fine-tuned or trained.
# Start tokenization, remove_column_names = ['url', 'timestamp', 'text']

len(tokenized_datasets["train"])
# 21153916

def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
    """This function is a copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2466>`__ .

    Training parameters to avoid padding with random_spans_noise_mask.
    When training a model with random_spans_noise_mask, we would like to set the other
    training hyperparameters in a way that avoids padding.
    This function helps us compute these hyperparameters.
    We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens,
    and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens.
    This function tells us the required number of tokens in the raw example (for split_tokens())
    as well as the length of the encoded targets. Note that this function assumes
    the inputs and targets will have EOS appended and includes that in the reported length.

    Args:
        inputs_length: an integer - desired length of the tokenized inputs sequence
        noise_density: a float
        mean_noise_span_length: a float
    Returns:
        tokens_length: length of original text in tokens
        targets_length: an integer - length in tokens of encoded targets sequence
    """

    def _tokens_length_to_inputs_length_targets_length(tokens_length):
        num_noise_tokens = int(round(tokens_length * noise_density))
        num_nonnoise_tokens = tokens_length - num_noise_tokens
        num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length))
        # inputs contain all nonnoise tokens, sentinels for all noise spans
        # and one EOS token.
        _input_length = num_nonnoise_tokens + num_noise_spans + 1
        _output_length = num_noise_tokens + num_noise_spans + 1
        return _input_length, _output_length

    tokens_length = inputs_length

    while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length:
        tokens_length += 1

    inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length(tokens_length)

    # minor hack to get the targets length to be equal to inputs length
    # which is more likely to have been set to a nice round number.
    if noise_density == 0.5 and targets_length > inputs_length:
        tokens_length -= 1
        targets_length -= 1
    return tokens_length, targets_length

# T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
# To ensure that the input length is `max_seq_length`, we need to increase the maximum length
# according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
expanded_inputs_length, targets_length = compute_input_and_target_lengths(
    inputs_length=128,
    noise_density=0.15,
    mean_noise_span_length=3.0,
)

print(f"Expanded_inputs_length: {expanded_inputs_length}, targets_length: {targets_length}")
print(f"Start group_texts")

# Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= expanded_inputs_length:
        total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
        for k, t in concatenated_examples.items()
    }
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
grouped_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=200,
    num_proc=96,
    load_from_cache_file=True,
)
# Expanded_inputs_length: 141, targets_length: 29
# Start group_texts
# (stderr: SIGTERM stack traces and coredump_hook messages from several worker PIDs
#  were logged while the multiprocess map ran)

examples = len(grouped_datasets["train"])
examples
# 86438919

it = iter(grouped_datasets["train"])

print(next(it))
# {'input_ids': [256, 3, 20, 18452, 6690, 7757, 1286, 43, 10, 4942, 1286, 80, 12, 4782, 5442, ...]}

tokens = next(it)['input_ids']

len(tokens)
# 141

tokenizer.decode(tokens)
# "werden volgens getuigen vergezeld door een boomlange bodyguard. ook hing er een gordijntje
#  om de tafel, zodat beyoncé in alle rust van de show kon genieten. volgens de bron verliet
#  knowles pas om 03.30 uur's ochtends de hippe club.</s> utrecht - in de schouwburg van utrecht
#  gaat vrijdagavond de musical 'joseph and the amazing technicolor dreamcoat' in première. [...]"

while (example := next(it, None)) is not None:
    if len(example['input_ids']) == 141:
        continue
    else:
        print(example)
        break
# Interrupted manually (KeyboardInterrupt) while scanning the dataset.
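A quick arithmetic check of the Expanded_inputs_length: 141, targets_length: 29 figures printed above: at tokens_length = 141 with noise_density = 0.15 and mean_noise_span_length = 3.0, the helper computes round(141 * 0.15) = 21 noise tokens, 141 - 21 = 120 non-noise tokens and round(21 / 3) = 7 noise spans, so the model inputs become 120 + 7 + 1 = 128 tokens (the requested inputs_length) and the targets 21 + 7 + 1 = 29 tokens.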
README.md
CHANGED
@@ -3,21 +3,34 @@ language:
 - dutch
 tags:
 - seq2seq
+- lm-head
 datasets:
+- yhavinga/mc4_nl_cleaned
+license: apache-2.0
+inference: false
 ---
 
 # t5-base-dutch
 
-Created by [Yeb Havinga](https://www.linkedin.com/in/yeb-havinga-86530825/) …
+Created by [Yeb Havinga](https://www.linkedin.com/in/yeb-havinga-86530825/)
+& [Dat Nguyen](https://www.linkedin.com/in/dat-nguyen-49a641138/) during the [Hugging Face community week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104), organized by [HuggingFace](https://huggingface.co/) and TPU usage sponsored by Google, for the project [Pre-train T5 from scratch in Dutch](https://discuss.huggingface.co/t/pretrain-t5-from-scratch-in-dutch/8109).
 
-See also the fine-tuned [t5-base-dutch-demo](https://huggingface.co/flax-community/t5-base-dutch-demo) model, …
+See also the fine-tuned [t5-base-dutch-demo](https://huggingface.co/flax-community/t5-base-dutch-demo) model,
+and the demo application **[Netherformer 📰](https://huggingface.co/spaces/flax-community/netherformer)**,
+that are based on this model.
+
+**5 jan 2022: Model updated. Evaluation accuracy increased from 0.64 to 0.70.**
+
+## Model
+
+* Configuration based on `google/t5-base`
+* 12 layers, 12 heads
+* Dropout set to 0.1
 
 ## Dataset
 
-This model was trained on …
+This model was trained on the `full` configuration of [cleaned Dutch mC4](https://huggingface.co/datasets/mc4_nl_cleaned),
+which is the original mC4, except
 
 * Documents that contained words from a selection of the Dutch and English [List of Dirty Naught Obscene and Otherwise Bad Words](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words) are removed
 * Sentences with less than 3 words are removed
@@ -26,13 +39,18 @@ See the `clean` directory for the clean script.
 * Documents with "javascript", "lorum ipsum", "terms of use", "privacy policy", "cookie policy", "uses cookies",
   "use of cookies", "use cookies", "elementen ontbreken", "deze printversie" are removed.
 
+## Tokenization
+
+A SentencePiece tokenizer was trained from scratch on this dataset.
+The total tokens of the `full` configuration is 34B
+
 ## Training
 
-… the first few resumes would start again at step 0 with a different seeded reshuffling of the data.
-In the last two resumes the random seed was fixed, and training would resume at the previous step, since a try/except around the failing example would allow training to continue in the case of errors caused by a single example.
+The model was trained on the `full` mc4_nl_cleaned dataset configuration for 1 epoch, consisting of 34B tokens,
+for 528 482 steps with a batch size of 128 and took 57 hours.
+A triangle learning rate schedule was used, with peak learning rate 0.005.
 
+## Evaluation
+
+* Loss: 1.38
+* Accuracy: 0.70

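The "triangle learning rate schedule" mentioned in the Training section above is a linear warmup followed by a linear decay. Below is a minimal sketch with optax, assuming the peak learning rate and step count from the model card and the 10 000 warmup steps passed in run_t5.sh; it illustrates the schedule shape, not the exact schedule object constructed by run_t5_mlm_flax.py.

import optax

# Sketch of a triangular schedule: linear warmup to the peak, then linear decay to zero.
# peak_lr and total_steps are taken from the model card above; warmup_steps from run_t5.sh.
def triangle_schedule(peak_lr=0.005, warmup_steps=10_000, total_steps=528_482):
    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=peak_lr, transition_steps=warmup_steps)
    decay_fn = optax.linear_schedule(init_value=peak_lr, end_value=0.0, transition_steps=total_steps - warmup_steps)
    return optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[warmup_steps])

schedule = triangle_schedule()
print(schedule(0), schedule(10_000), schedule(528_482))  # 0.0 at the start, 0.005 after warmup, ~0.0 at the end
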
config.json
CHANGED
@@ -52,7 +52,7 @@
     }
   },
   "torch_dtype": "float32",
-  "transformers_version": "4.…",
+  "transformers_version": "4.13.0",
   "use_cache": true,
   "vocab_size": 32103
 }

flax_model.msgpack
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
+oid sha256:be5973ac1f68ec3c5ceb47e10ed848b83ad06e69affa938fc400e3ef368143ea
 size 891548548

flax_to_pt.py
CHANGED
@@ -1,6 +1,26 @@
+import torch
+import numpy as np
+import jax.numpy as jnp
+from transformers import AutoTokenizer
+from transformers import FlaxT5ForConditionalGeneration
+from transformers import T5ForConditionalGeneration
+tokenizer = AutoTokenizer.from_pretrained(".")
+model_fx = FlaxT5ForConditionalGeneration.from_pretrained(".")
+model_pt = T5ForConditionalGeneration.from_pretrained(".", from_flax=True)
+model_pt.save_pretrained("./")
+text = "Hoe gaat het?"
+e_input_ids_fx = tokenizer(text, return_tensors="np", padding=True, max_length=128, truncation=True)
+d_input_ids_fx = jnp.ones((e_input_ids_fx.input_ids.shape[0], 1), dtype="i4") * model_fx.config.decoder_start_token_id
+e_input_ids_pt = tokenizer(text, return_tensors="pt", padding=True, max_length=128, truncation=True)
+d_input_ids_pt = np.ones((e_input_ids_pt.input_ids.shape[0], 1), dtype="i4") * model_pt.config.decoder_start_token_id
+print(e_input_ids_fx)
+print(d_input_ids_fx)
+print()
+encoder_pt = model_fx.encode(**e_input_ids_pt)
+decoder_pt = model_fx.decode(d_input_ids_pt, encoder_pt)
+logits_pt = decoder_pt.logits
+print(logits_pt)
+encoder_fx = model_fx.encode(**e_input_ids_fx)
+decoder_fx = model_fx.decode(d_input_ids_fx, encoder_fx)
+logits_fx = decoder_fx.logits
+print(logits_fx)

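The committed flax_to_pt.py feeds both the NumPy and the PyTorch inputs through the Flax model and prints the logits. The following is a hedged sketch of a direct numerical comparison between the Flax model and the converted PyTorch checkpoint; the "." model path mirrors the script, and the atol tolerance is an assumption.

import numpy as np
import torch
from transformers import AutoTokenizer, FlaxT5ForConditionalGeneration, T5ForConditionalGeneration

# Sketch: check that the converted PyTorch weights give (nearly) the same logits as the Flax model.
tokenizer = AutoTokenizer.from_pretrained(".")
model_fx = FlaxT5ForConditionalGeneration.from_pretrained(".")
model_pt = T5ForConditionalGeneration.from_pretrained(".", from_flax=True)

text = "Hoe gaat het?"
start = model_fx.config.decoder_start_token_id
inputs_fx = tokenizer(text, return_tensors="np")
inputs_pt = tokenizer(text, return_tensors="pt")

logits_fx = model_fx(**inputs_fx, decoder_input_ids=np.full((1, 1), start, dtype="i4")).logits
with torch.no_grad():
    logits_pt = model_pt(**inputs_pt, decoder_input_ids=torch.full((1, 1), start, dtype=torch.long)).logits

print(np.allclose(np.asarray(logits_fx), logits_pt.numpy(), atol=1e-3))
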
opt_state.msgpack
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ffae8bd1730e35ebeb0619a7d1b75dab07addff2320d2394eb1af891820ca64f
-size 1985609

pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
+oid sha256:f102fac4815a8b1b29916b196bfe88a0e5fef76083c6007a5c7966a7fcb9b2d6
 size 891650495

run_t5.sh
CHANGED
@@ -1,79 +1,37 @@
-# SEED=$RANDOM
-SEED=22384
-
-./run_t5_mlm_flax_custom_dataset.py \
-    --output_dir="${MODEL_DIR}" \
-    --model_type="t5" \
-    --config_name="flax-community/${MODEL}" \
-    --tokenizer_name="${MODEL_DIR}" \
-    --seed="${SEED}" \
-    --preprocessing_num_workers="96" \
-    --do_train --do_eval \
-    --adafactor \
-    --max_seq_length="512" \
-    --per_device_train_batch_size="16" \
-    --per_device_eval_batch_size="16" \
-    --dtype="bfloat16" \
-    --learning_rate="1e-3" \
-    --overwrite_output_dir \
-    --num_train_epochs="1" \
-    --logging_steps="50" \
-    --save_steps="500" \
-    --eval_steps="5000" \
-    --resume_from_checkpoint="${MODEL_DIR}" \
-    --warmup_steps="6519"
-
-# \
-# --push_to_hub
-
-echo "RESTARTING"
-sleep 20
-done
-
-#git add pytorch_model.bin
-#git commit -m "Update pytorch model after training"
-#git push origin main
-
-# --gradient_accumulation_steps="2" \
-
-# --resume_from_checkpoint="${MODEL_DIR}/ckpt-18000" \
+#!/bin/bash
+
+export HF_PROJECT="t5-base-dutch"
+
+# Variables for training the tokenizer and creating the config
+export VOCAB_SIZE="32000"
+export N_INPUT_SENTENCES="1000000" # Num of sentences to train the tokenizer
+export DATASET="yhavinga/mc4_nl_cleaned" # Name of the dataset in the Huggingface Hub
+export DATASET_CONFIG="full" # Config of the dataset in the Huggingface Hub
+export DATASET_SPLIT="train" # Split to use for training tokenizer and model
+export TEXT_FIELD="text" # Field containing the text to be used for training
+export CONFIG_TYPE="t5-base" # Config that our model will use
+export MODEL_PATH="${HOME}/data/${HF_PROJECT}" # Path to the model, e.g. here inside the mount
+
+python run_t5_mlm_flax.py \
+    --output_dir="${MODEL_PATH}" \
+    --model_type="t5" \
+    --config_name="${MODEL_PATH}" \
+    --tokenizer_name="${MODEL_PATH}" \
+    --preprocessing_num_workers="96" \
+    --do_train --do_eval \
+    --dataset_name="${DATASET}" \
+    --dataset_config_name="${DATASET_CONFIG}" \
+    --max_seq_length="512" \
+    --per_device_train_batch_size="16" \
+    --per_device_eval_batch_size="16" \
+    --adafactor \
+    --learning_rate="0.005" \
+    --overwrite_output_dir \
+    --num_train_epochs="1" \
+    --logging_steps="500" \
+    --save_steps="80000" \
+    --eval_steps="2500" \
+    --weight_decay="0.01" \
+    --warmup_steps="10000" \
+    --validation_split_count="15000" \
+    --push_to_hub

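With --per_device_train_batch_size="16" on an 8-device TPU host (an assumption; the exact hardware is not stated in this commit), the effective batch size is 128, which matches the model card. A rough sanity check of the reported step count against the ~34B token figure:

# Back-of-the-envelope check; all numbers except the device count come from run_t5.sh and the README.
devices = 8                              # assumed TPU v3-8 host
per_device_batch = 16                    # --per_device_train_batch_size
batch_size = devices * per_device_batch  # 128, as in the model card
seq_len = 512                            # --max_seq_length
steps = 528_482                          # from the README Training section

tokens_per_step = batch_size * seq_len
print(f"{steps * tokens_per_step / 1e9:.1f}B tokens")  # ~34.6B, consistent with the ~34B tokens of the `full` config
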
run_t5_mlm_flax_custom_dataset.py → run_t5_mlm_flax.py
RENAMED
@@ -18,6 +18,8 @@ Pretraining the library models for T5-like span-masked language modeling on a te
 
 Here is the full list of checkpoints on the hub that can be pretrained by this script:
 https://huggingface.co/models?filter=t5
+
+Adapted from the original version to support gradient accumulation and restarting.
 """
 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 import logging
@@ -25,13 +27,13 @@ import os
 import sys
 import time
 import json
-import shutil
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Dict, List, Optional
 
 import numpy as np
+from datasets import load_dataset
 from tqdm import tqdm
 
 import flax
@@ -39,34 +41,31 @@ import jax
 import jax.numpy as jnp
 import optax
 from flax import jax_utils, traverse_util
+from flax.serialization import to_bytes, from_bytes
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
+# from huggingface_hub import Repository
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoTokenizer,
     BatchEncoding,
     FlaxT5ForConditionalGeneration,
-    T5ForConditionalGeneration,
     HfArgumentParser,
     PreTrainedTokenizerBase,
     T5Config,
-    T5TokenizerFast,
     TrainingArguments,
     is_tensorboard_available,
     set_seed,
 )
+# from transformers.file_utils import get_full_repo_name
 from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
 
 logger = logging.getLogger(__name__)
 
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
 
-data_files = []
-
 @dataclass
 class ModelArguments:
     """
@@ -103,6 +102,12 @@ class ModelArguments:
             "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
         },
     )
+    auth_token: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Auth token for private repositories on the Huggingface Hub"
+        }
+    )
 
 
 @dataclass
@@ -133,10 +138,10 @@ class DataTrainingArguments:
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
+    validation_split_count: Optional[int] = field(
+        default=10000,
+        metadata={
+            "help": "The count of the train set used as validation set in case there's no validation split"
+        },
+    )
     max_seq_length: Optional[int] = field(
@@ -156,18 +161,31 @@ class DataTrainingArguments:
         default=3.0,
         metadata={"help": "Mean span length of masked tokens"},
     )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
 
     def __post_init__(self):
-        # assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
 
 
 def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
@@ -297,7 +315,7 @@ class FlaxDataCollatorForT5MLM:
         start_indices[:, 0] = mask_indices[:, 0]
 
         sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
+        sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
         sentinel_ids -= mask_indices - start_indices
 
         return sentinel_ids
@@ -362,7 +380,8 @@ class FlaxDataCollatorForT5MLM:
             np.random.shuffle(mask_indices)
             first_in_segment = np.pad(mask_indices, [[1, 0]])
             segment_id = np.cumsum(first_in_segment)
+            # count length of sub segments assuming that list is sorted
+            _, segment_length = np.unique(segment_id, return_counts=True)
             return segment_length
 
         noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
@@ -405,70 +424,40 @@ def write_eval_metric(summary_writer, eval_metrics, step):
     for metric_name, value in eval_metrics.items():
         summary_writer.scalar(f"eval_{metric_name}", value, step)
 
 
 def mb_item(x):
     return x.item() if hasattr(x, "item") else x
 
 
-def save_checkpoint(model, save_dir, state, with_opt: bool = True):
+def save_checkpoint(model, save_dir, state, cur_step: int, with_opt: bool = True, push_to_hub: bool = False):
     state = jax_utils.unreplicate(state)
-    logger.info(f"SAVING CHECKPOINT IN {save_dir}")
-    save_dir = f"{save_dir}/ckpt-{mb_item(state.step) - 1}"
-    model.save_pretrained(
-        save_dir,
-        params=state.params,
-        push_to_hub=False
-    )
     if with_opt:
+        logger.info(f'Saving optimizer and training state in {save_dir}...')
         with open(os.path.join(save_dir, "opt_state.msgpack"), "wb") as f:
             f.write(to_bytes(state.opt_state))
         with open(os.path.join(save_dir, "training_state.json"), "w") as f:
             json.dump({"step": state.step.item()}, f)
+    logger.info(f'Saving model in {save_dir} {"and pushing it to HF Hub" if push_to_hub else ""}')
     model.save_pretrained(
+        save_dir,
         params=state.params,
+        push_to_hub=push_to_hub,
         commit_message=f"Saving weights and logs of step {cur_step}",
     )
-    if with_opt:
-        with open(os.path.join(training_args.output_dir, "opt_state.msgpack"), "wb") as f:
-            f.write(to_bytes(state.opt_state))
-        with open(os.path.join(training_args.output_dir, "training_state.json"), "w") as f:
-            json.dump({"step": state.step.item()}, f)
-    logger.info("checkpoint saved")
 
+def restore_checkpoint(load_dir, state):
+    logger.info(f"Restoring checkpoint from {load_dir}")
+    with open(os.path.join(load_dir, "flax_model.msgpack"), "rb") as f:
         params = from_bytes(state.params, f.read())
+    with open(os.path.join(load_dir, "opt_state.msgpack"), "rb") as f:
         opt_state = from_bytes(state.opt_state, f.read())
+    with open(os.path.join(load_dir, "training_state.json"), "r") as f:
         training_state = json.load(f)
     step = training_state["step"]
+    logger.info(f"Checkpoint restored at step {step}")
     return state.replace(step=step, params=params, opt_state=opt_state), step
 
 
-def rotate_checkpoints(ckpt_dir: str, save_total_limit: int):
-    "Removes older checkpoints so that `save_total_limit` checkpoints are kept"
-    # TODO: what to remove is decided using step number only, we might want to improve that
-    ckpts = [str(x) for x in Path(ckpt_dir).glob("ckpt-*")]
-    # sort checkpoints by step
-    ckpts_sorted = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))
-    ckpts_to_delete = ckpts_sorted[:-save_total_limit]
-    for ckpt in ckpts_to_delete:
-        logger.info(f"Deleting older checkpoint [{ckpt}] due to save_total_limit ({save_total_limit})")
-        shutil.rmtree(ckpt)
-
-
 if __name__ == "__main__":
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
@@ -509,6 +498,16 @@ if __name__ == "__main__":
     # Set seed before initializing model.
     set_seed(training_args.seed)
 
+    # Handle the repository creation
+    # if training_args.push_to_hub:
+    #     if training_args.hub_model_id is None:
+    #         repo_name = get_full_repo_name(
+    #             Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+    #         )
+    #     else:
+    #         repo_name = training_args.hub_model_id
+    #     repo = Repository(training_args.output_dir, clone_from=repo_name)
+
     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
     # (the dataset will be downloaded automatically from the datasets Hub).
@@ -523,82 +522,38 @@ if __name__ == "__main__":
         datasets["validation"] = load_dataset(
             data_args.dataset_name,
             data_args.dataset_config_name,
+            split=f"train[:{data_args.validation_split_count}]",
             cache_dir=model_args.cache_dir,
         )
         datasets["train"] = load_dataset(
             data_args.dataset_name,
             data_args.dataset_config_name,
+            split=f"train[{data_args.validation_split_count}:]",
+            cache_dir=model_args.cache_dir,
+        )
+    else:
+        datasets["validation"] = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            split=f"validation[:{data_args.validation_split_count}]",
+            cache_dir=model_args.cache_dir,
+        )
+        datasets["train"] = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            split="train",
             cache_dir=model_args.cache_dir,
         )
     else:
-            global data_files
-            data_files += glob.glob(f"{path}/{filespec}")
-            data_files = list(set(data_files))
-            print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
-
-        # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*73*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
-        add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
-        # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
-        add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
-        add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
-        random.Random(SEED).shuffle(data_files)
-
-        total = len(data_files)
-        print(total)
-        perc = 0.05
-        val_size = int(perc * total)
-        train_size = total - val_size
-        train = data_files[:train_size]
-        val = data_files[train_size:]
-        print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
-
-        assert list(set(train) & set(val)) == [], "Train overlaps with test"
-
-        return train, val
-
-    # train, val = train_val_files()
-
-    load_grouped = True
-
-    if not load_grouped:
-        datasets = load_dataset('json', data_files={'train': train, 'validation': val})
-
-    # data_files = {}
-    # if data_args.train_file is not None:
-    #     data_files["train"] = data_args.train_file
-    # if data_args.validation_file is not None:
-    #     data_files["validation"] = data_args.validation_file
-    # extension = data_args.train_file.split(".")[-1]
-    # if extension == "txt":
-    #     extension = "text"
-    # datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -606,12 +561,18 @@ if __name__ == "__main__":
     # Load pretrained model and tokenizer
 
     if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name,
+            cache_dir=model_args.cache_dir,
+            use_fast=model_args.use_fast_tokenizer,
+            use_auth_token=model_args.auth_token
+        )
     elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=model_args.cache_dir,
+            use_fast=model_args.use_fast_tokenizer,
+            use_auth_token=model_args.auth_token
+        )
     else:
         raise ValueError(
@@ -631,8 +592,30 @@ if __name__ == "__main__":
         config = CONFIG_MAPPING[model_args.model_type]()
         logger.warning("You are instantiating a new config instance from scratch.")
 
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
     max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
 
+    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
+    def tokenize_function(examples):
+        return tokenizer(examples[text_column_name], return_attention_mask=False)
+
+    logger.info(f"Start tokenization, remove_column_names = {column_names}")
+    tokenized_datasets = datasets.map(
+        tokenize_function,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not data_args.overwrite_cache,
+    )
+
     # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
     # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
     # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
@@ -643,64 +626,36 @@ if __name__ == "__main__":
     )
     logger.info(f"Max seq length: {max_seq_length}, expanded_inputs_length: {expanded_inputs_length}, targets_length: {targets_length}")
 
-    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
-    def tokenize_function(examples):
-        return tokenizer(examples[text_column_name], return_attention_mask=False)
-
-    logger.info(f"Start tokenization, remove_column_names = {column_names}")
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
-
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        if total_length >= expanded_inputs_length:
+            total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
+            for k, t in concatenated_examples.items()
+        }
+        return result
 
     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
     # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
     # might be slower to preprocess.
     #
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
     logger.info(f"Start group_texts")
     tokenized_datasets = tokenized_datasets.map(
         group_texts,
         batched=True,
         batch_size=200,
         num_proc=data_args.preprocessing_num_workers,
         load_from_cache_file=not data_args.overwrite_cache,
     )
 
     # Enable tensorboard only on the master node
     has_tensorboard = is_tensorboard_available()
@@ -729,15 +684,9 @@ if __name__ == "__main__":
             model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
         )
     else:
+        config.vocab_size = len(tokenizer)
        model = FlaxT5ForConditionalGeneration(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
 
-    # def to_bf16(t):
-    #     return jax.tree_map(lambda x: x.astype(jnp.bfloat16) if x.dtype == jnp.float32 else x, t)
-    #
-    #
-    # model.params = to_bf16(model.params)
-
     # Data collator
     # This one will take care of randomly masking the tokens.
     data_collator = FlaxDataCollatorForT5MLM(
@@ -752,16 +701,13 @@ if __name__ == "__main__":
 
     # Store some constant
     num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
     eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
 
-    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
-
     steps_per_epoch = len(tokenized_datasets['train']) // train_batch_size
+    num_train_steps = steps_per_epoch * num_epochs
 
     # Create learning rate schedule
     if training_args.warmup_steps:
         warmup_steps = training_args.warmup_steps
     elif training_args.warmup_ratio:
@@ -770,7 +716,6 @@ if __name__ == "__main__":
         logging.info(f"Warmup steps set to {100*training_args.warmup_ratio}% = {warmup_steps} of total train steps {num_train_steps}")
     else:
         raise Exception("Need either --warmup_steps or --warmup_ratio")
-
     warmup_fn = optax.linear_schedule(
         init_value=0.0, end_value=training_args.learning_rate, transition_steps=warmup_steps
     )
@@ -823,8 +768,6 @@ if __name__ == "__main__":
     else:
         resume_step = 0
 
-    logger.info("")
-
     # Define gradient update step fn
     def train_step(state, batch, dropout_rng):
         dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
@@ -845,7 +788,8 @@ if __name__ == "__main__":
         new_state = state.apply_gradients(grads=grad)
 
         metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step // grad_accum_steps)},
+            axis_name="batch"
         )
 
         return new_state, metrics, new_dropout_rng
@@ -875,17 +819,20 @@ if __name__ == "__main__":
 
     logger.info("Replicate the train state on each device")
 
+    # import pydevd_pycharm
+    #
+    # pydevd_pycharm.settrace('localhost', port=12345, stdoutToServer=True, stderrToServer=True)
+
     # Replicate the train state on each device
     state = jax_utils.replicate(state)
 
     logger.info("***** Running training *****")
+    logger.info(f" Num examples = {len(datasets['train'])}")
     logger.info(f" Num tokenized group examples {len(tokenized_datasets['train'])}")
     logger.info(f" Num Epochs = {num_epochs}")
     logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
     logger.info(f" Total train batch size (w. parallel, distributed and grad_accum) = {train_batch_size}")
+    logger.info(f" Total optimization steps = {num_train_steps}")
 
     train_time = 0
    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
@@ -899,16 +846,26 @@ if __name__ == "__main__":
 
         # Generate an epoch by shuffling sampling indices from the train dataset
         num_train_samples = len(tokenized_datasets["train"])
-        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
-        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+        # train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        # train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+
+        ## IF THE DATASET IS TOO LONG, WE ONLY PROCEED SEQUENTIALLY WITHOUT SHUFFLING
+        samples_to_remove = num_train_samples % (train_batch_size // grad_accum_steps)
+        samples_idx = np.arange(num_train_samples)
+        if samples_to_remove != 0:
+            samples_idx = samples_idx[:-samples_to_remove]
+        steps = num_train_samples // (train_batch_size // grad_accum_steps)
 
         # Gather the indexes for creating the batch and do a training step
-        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+        # for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+        #     samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+        for step in tqdm(range(steps), desc="Training...", position=1):
             cur_step = epoch * (num_train_samples // train_batch_size) + step
             # skip to the step from which we are resuming
            if cur_step < resume_step:
                continue
 
+            batch_idx = [x for x in range(step * train_batch_size, (step + 1) * train_batch_size)]
            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            try:
                model_inputs = data_collator(samples)
@@ -922,7 +879,6 @@ if __name__ == "__main__":
             state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
             train_metrics.append(train_metric)
 
-
             if cur_step % training_args.logging_steps * grad_accum_steps == 0 and cur_step > 0:
                 # Save metrics
                 train_metric = jax_utils.unreplicate(train_metric)
@@ -931,7 +887,7 @@ if __name__ == "__main__":
                     write_train_metric(summary_writer, train_metrics, train_time, cur_step)
 
                 epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
                 )
 
                 train_metrics = []
@@ -961,39 +917,50 @@ if __name__ == "__main__":
 
                 # Save metrics
                 if has_tensorboard and jax.process_index() == 0:
-                    cur_step = epoch * (len(tokenized_datasets["train"]) // train_batch_size)
                     write_eval_metric(summary_writer, eval_metrics, cur_step)
 
             if cur_step % training_args.save_steps * grad_accum_steps == 0 and cur_step > 0:
-                logger.info(f"We should save the model here after {cur_step} steps")
                 # save checkpoint after each epoch and push checkpoint to the hub
                 if jax.process_index() == 0:
-                    save_checkpoint(model, training_args.output_dir, state)
-                    if training_args.save_total_limit is not None:
-                        rotate_checkpoints(training_args.output_dir, training_args.save_total_limit)
                     # params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    # model.save_pretrained(training_args.output_dir, params=params)
+                    # tokenizer.save_pretrained(training_args.output_dir)
+                    # if training_args.push_to_hub:
+                    #     repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+                    save_checkpoint(model, training_args.output_dir, state, cur_step, with_opt=False, push_to_hub=True)
+
+    # Eval after training
+    if training_args.do_eval:
+        num_eval_samples = len(tokenized_datasets["validation"])
+        eval_samples_idx = jnp.arange(num_eval_samples)
+        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+
+        eval_metrics = []
+        for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples)
+
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            metrics = p_eval_step(state.params, model_inputs)
+            eval_metrics.append(metrics)
+
+        # get eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(lambda metric: jnp.mean(metric).item(), eval_metrics)
 
         if jax.process_index() == 0:
+            eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+            path = os.path.join(training_args.output_dir, "eval_results.json")
+            with open(path, "w") as f:
+                json.dump(eval_metrics, f, indent=4, sort_keys=True)
+
-    # Save model at end
+    # Save model at end
+    if jax.process_index() == 0:
+        # params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+        # model.save_pretrained(training_args.output_dir, params=params)
+        # tokenizer.save_pretrained(training_args.output_dir)
+        # if training_args.push_to_hub:
+        #     repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+        #
+        save_checkpoint(model, training_args.output_dir, state, cur_step, with_opt=False, push_to_hub=True)

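The group_texts function introduced above concatenates the tokenized documents of a batch and slices them into fixed-length chunks of expanded_inputs_length. A small self-contained illustration with toy data; the chunk length 8 is arbitrary, the real script derives it from max_seq_length and the masking settings.

from itertools import chain

expanded_inputs_length = 8  # toy value for illustration only

def group_texts(examples):
    # Concatenate all texts, drop the remainder, and split into equally sized chunks,
    # mirroring the map function in run_t5_mlm_flax.py.
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    if total_length >= expanded_inputs_length:
        total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
    return {
        k: [t[i: i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
        for k, t in concatenated.items()
    }

batch = {"input_ids": [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11, 12], [13, 14, 15, 16, 17, 18]]}
print(group_texts(batch))
# {'input_ids': [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16]]}
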
streaming_dataset_filter_test.py
DELETED
@@ -1,93 +0,0 @@
-from clean import clean_text
-
-from datasets import load_dataset
-
-dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)
-
-# data_dir = "/home/yeb"
-data_dir = "/home/yeb/Developer/data"
-data_files = []
-
-def train_val_files():
-    import glob
-    import random
-    SEED = 12345
-
-    def add_jsonlines_dir(path, filespec):
-        global data_files
-        data_files += glob.glob(f"{path}/{filespec}")
-        data_files = list(set(data_files))
-        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
-
-    # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
-    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
-    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
-    # add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
-    # add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
-    random.Random(SEED).shuffle(data_files)
-
-    total = len(data_files)
-    print(total)
-    perc = 0.05
-    val_size = int(perc * total)
-    train_size = total - val_size
-    train = data_files[:train_size]
-    val = data_files[train_size:]
-    print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
-
-    assert list(set(train) & set(val)) == [], "Train overlaps with test"
-
-    return train, val
-
-train, val = train_val_files()
-dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})
-
-
-dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")
-
-def f(obj):
-    obj["text"] = clean_text(obj["text"])
-    return obj
-
-
-dataset_v1 = dataset_v0.map(
-    f,
-    batched=False,
-    num_proc=10,
-)
-
-datasets = dataset_v1.filter(
-    lambda obj: obj['text'] is not None,
-    num_proc=10,
-)
-
-it = iter(dataset_v0['train'])
-print(next(it))
-print(next(it))
-print(next(it))
-
-it = iter(dataset_v1['train'])
-print(next(it))
-print(next(it))
-print(next(it))
-
-# it = iter(dataset_v2)
-# print(next(it))
-# print(next(it))
-# print(next(it))

tf_model.h5
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:3083c65d23d0521977a9739022c8e48f3ee1094d43317b150cf044f1451cfd9c
+size 892068248

train_tokenizer.py
DELETED
@@ -1,66 +0,0 @@
-from datasets import load_dataset
-from t5_tokenizer_model import SentencePieceUnigramTokenizer
-
-# from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
-
-data_dir = "/home/yeb"
-data_files = []
-
-
-def train_val_files():
-    import glob
-    import random
-    SEED = 12345
-
-    def add_jsonlines_dir(path, filespec):
-        global data_files
-        data_files += glob.glob(f"{path}/{filespec}")
-        print(f"Number of files {len(data_files)} after adding {path}")
-
-    # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
-    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*47*.gz")
-    add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
-    add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
-    random.Random(SEED).shuffle(data_files)
-
-    print(data_files)
-    total = len(data_files)
-    print(total)
-    perc = 0.01
-    val_size = int(perc * total)
-    train_size = total - val_size
-    train = data_files[:train_size]
-    val = data_files[train_size:]
-    print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
-
-    assert list(set(train) & set(val)) == [], "Train overlaps with test"
-
-    return train, val
-
-
-train, val = train_val_files()
-
-dataset = load_dataset('json', data_files={'train': train, 'validation': val}, split='train')
-
-vocab_size = 32000
-input_sentence_size = None
-tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
-
-
-# Build an iterator over this dataset
-def batch_iterator(input_sentence_size=None):
-    if input_sentence_size is None:
-        input_sentence_size = len(dataset)
-    batch_length = 100
-    for i in range(0, input_sentence_size, batch_length):
-        yield dataset[i: i + batch_length]["text"]
-
-# Train tokenizer
-tokenizer.train_from_iterator(
-    iterator=batch_iterator(input_sentence_size=input_sentence_size),
-    vocab_size=vocab_size,
-    show_progress=True,
-)
-
-# Save files to disk
-tokenizer.save("./tokenizer.json")

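The deleted train_tokenizer.py wrote its result to tokenizer.json. For reference, a tokenizer file produced this way can be loaded back through the fast-tokenizer wrapper; this is a sketch, with the special-token names mirroring the ones passed to SentencePieceUnigramTokenizer above.

from transformers import PreTrainedTokenizerFast

# Load the tokenizer.json written by the (now removed) train_tokenizer.py.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./tokenizer.json",
    unk_token="<unk>",
    eos_token="</s>",
    pad_token="<pad>",
)
print(tokenizer("Hoe gaat het?").input_ids)
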
training_state.json
DELETED
@@ -1 +0,0 @@
-{"step": 62500}

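training_state.json held only the global step counter that restore_checkpoint reads back when resuming; a one-line sketch of that read, with an illustrative path:

import json

# The removed file contained just {"step": 62500}; restore_checkpoint uses this value to skip already-trained steps.
with open("training_state.json") as f:
    resume_step = json.load(f)["step"]
print(resume_step)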