Update weights and scripts
- Load_preprocessed_dataset.ipynb +22 -4
- flax_model.msgpack +1 -1
- opt_state.msgpack +1 -1
- pytorch_model.bin +1 -1
- run_t5.sh +30 -6
- run_t5_mlm_flax_custom_dataset.py +8 -6
- runs/Jul11_12-53-41_t1v-n-0e7426e8-w-0/events.out.tfevents.1626008983.t1v-n-0e7426e8-w-0.161493.3.v2 +0 -3
- runs/Jul12_06-43-08_t1v-n-0e7426e8-w-0/events.out.tfevents.1626072193.t1v-n-0e7426e8-w-0.238699.3.v2 +0 -3
- runs/Jul13_10-43-12_t1v-n-0e7426e8-w-0/events.out.tfevents.1626172997.t1v-n-0e7426e8-w-0.622440.3.v2 +2 -2
- runs/{Jul10_08-38-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625906314.t1v-n-0e7426e8-w-0.25839.3.v2 → Jul13_13-19-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1626182353.t1v-n-0e7426e8-w-0.634946.3.v2} +2 -2
- runs/{Jul09_21-43-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625867209.t1v-n-0e7426e8-w-0.420316.3.v2 → Jul13_20-26-47_t1v-n-0e7426e8-w-0/events.out.tfevents.1626208012.t1v-n-0e7426e8-w-0.683072.3.v2} +2 -2
- runs/{Jul11_09-15-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1625995853.t1v-n-0e7426e8-w-0.145718.3.v2 → Jul13_21-28-58_t1v-n-0e7426e8-w-0/events.out.tfevents.1626211744.t1v-n-0e7426e8-w-0.688568.3.v2} +2 -2
- runs/{Jul10_07-45-49_t1v-n-0e7426e8-w-0/events.out.tfevents.1625903173.t1v-n-0e7426e8-w-0.20563.3.v2 → Jul14_05-04-35_t1v-n-0e7426e8-w-0/events.out.tfevents.1626239081.t1v-n-0e7426e8-w-0.720177.3.v2} +2 -2
- runs/Jul14_05-21-55_t1v-n-0e7426e8-w-0/events.out.tfevents.1626240121.t1v-n-0e7426e8-w-0.722772.3.v2 +3 -0
- streaming_dataset_filter_test.py +72 -8
- training_state.json +1 -1
Load_preprocessed_dataset.ipynb
CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 4,
    "id": "cf148030-7287-4c9e-ae32-8d1e1c47be30",
    "metadata": {},
    "outputs": [],
@@ -12,12 +12,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count":
+   "execution_count": 7,
   "id": "5161b4ba-e8cf-43e1-b67e-503c29aa4271",
   "metadata": {},
-   "outputs": [
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '/home/yeb/grouped_dataset/dataset_dict.json'",
+     "output_type": "error",
+     "traceback": [
+      (ANSI-coloured traceback elided: FileNotFoundError raised inside DatasetDict.load_from_disk, via fsspec's LocalFileOpener, for '/home/yeb/grouped_dataset/dataset_dict.json')
+     ]
+    }
+   ],
   "source": [
-   "datasets = DatasetDict.load_from_disk(\"
+   "datasets = DatasetDict.load_from_disk(\"/home/yeb/grouped_dataset\")"
   ]
  },
  {
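The error captured in the new cell output above is what the datasets library raises when the target directory was not written by DatasetDict.save_to_disk, since that call is what creates the dataset_dict.json index that load_from_disk looks for. A minimal sketch of the expected round trip, using a toy dataset and a throwaway /tmp path rather than the /home/yeb/grouped_dataset directory from the notebook:

from datasets import Dataset, DatasetDict

# Build a tiny DatasetDict and persist it; save_to_disk writes the
# dataset_dict.json index that load_from_disk later expects to find.
dsets = DatasetDict({
    "train": Dataset.from_dict({"text": ["een", "twee", "drie"]}),
    "validation": Dataset.from_dict({"text": ["vier"]}),
})
dsets.save_to_disk("/tmp/grouped_dataset_demo")

# Reloading from the same directory succeeds; pointing load_from_disk at a
# directory without dataset_dict.json raises the FileNotFoundError shown above.
reloaded = DatasetDict.load_from_disk("/tmp/grouped_dataset_demo")
print(reloaded)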
flax_model.msgpack
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0a2f203c6e4fb395cd1af46eddc3fdbb688e995e9d67023a95fec04d3a338d3d
 size 891548548
opt_state.msgpack
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e32d952b358ab33b2ce966fdff36cd2b253376e77f7867b0d920ea8926c0c08a
 size 1985609
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:aa8fa3e8f3ce483fb79cee84d6a4e2057ad2706010fb522c44cad201f8c0af66
 size 891650495
run_t5.sh
CHANGED
@@ -7,6 +7,30 @@ mkdir -p "${MODEL_DIR}/runs"
 # T5 paper lr 0.01 with batch size 128
 # We have a batch size of 8 devices * 32 = 256, so lr = 0.01/2

+#SEED=9200
+#
+#./run_t5_mlm_flax_custom_dataset.py \
+#    --output_dir="${MODEL_DIR}" \
+#    --model_type="t5" \
+#    --config_name="flax-community/${MODEL}" \
+#    --tokenizer_name="${MODEL_DIR}" \
+#    --seed="${SEED}" \
+#    --preprocessing_num_workers="96" \
+#    --do_train --do_eval \
+#    --adafactor \
+#    --max_seq_length="512" \
+#    --per_device_train_batch_size="32" \
+#    --per_device_eval_batch_size="32" \
+#    --dtype="bfloat16" \
+#    --learning_rate="5e-3" \
+#    --overwrite_output_dir \
+#    --num_train_epochs="3" \
+#    --logging_steps="50" \
+#    --save_steps="100" \
+#    --eval_steps="5000" \
+#    --warmup_steps="3413"
+#exit
+
 while true; do

 # Set the seed to random before each run, so date shuffling per epoch is different each run.
@@ -23,17 +47,17 @@ while true; do
     --do_train --do_eval \
     --adafactor \
     --max_seq_length="512" \
-    --per_device_train_batch_size="
-    --per_device_eval_batch_size="
-    --learning_rate="5e-3" \
+    --per_device_train_batch_size="16" \
+    --per_device_eval_batch_size="16" \
     --dtype="bfloat16" \
+    --learning_rate="1e-2" \
     --overwrite_output_dir \
     --num_train_epochs="3" \
     --logging_steps="50" \
-    --save_steps="
-    --eval_steps="
+    --save_steps="500" \
+    --eval_steps="5000" \
     --resume_from_checkpoint="${MODEL_DIR}" \
-    --warmup_steps="
+    --warmup_steps="6519"

 # \
 # --push_to_hub
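The comments at the top of run_t5.sh reason about the effective batch size as 8 TPU devices times the per-device batch size. A minimal sketch of that arithmetic, assuming the 8-device count stated in the comment; the helper function is illustrative and not part of the script:

# Mirrors the batch-size arithmetic in the run_t5.sh comments above.
def global_batch_size(num_devices: int, per_device_batch_size: int) -> int:
    return num_devices * per_device_batch_size

# Commented-out invocation: 8 * 32 = 256 examples per optimizer step.
print(global_batch_size(8, 32))
# Current invocation: 8 * 16 = 128, the batch size the T5 paper pairs with lr 0.01.
print(global_batch_size(8, 16))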
run_t5_mlm_flax_custom_dataset.py
CHANGED
@@ -645,7 +645,12 @@ if __name__ == "__main__":

    # Preprocessing the datasets.
    # First we tokenize all the texts.
-   if
+   if load_grouped:
+       logger.info("Loading tokenized and grouped dataset")
+       tokenized_datasets = DatasetDict.load_from_disk("/home/yeb/grouped_datasets")
+       logger.info("Setting max validation examples to 500")
+       tokenized_datasets['validation'] = tokenized_datasets['validation'].select(range(500))
+   else:
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
@@ -696,9 +701,6 @@ if __name__ == "__main__":
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )
-   else:
-       logger.info("Loading tokenized and grouped dataset")
-       tokenized_datasets = DatasetDict.load_from_disk("/home/yeb/grouped_datasets")

    # Enable tensorboard only on the master node
    has_tensorboard = is_tensorboard_available()
@@ -904,8 +906,8 @@ if __name__ == "__main__":
        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
            cur_step = epoch * (num_train_samples // train_batch_size) + step
            # skip to the step from which we are resuming
-           #
-           #
+           # if cur_step < resume_step:
+           #     continue

            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples)
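As a standalone illustration of the new load_grouped branch above: it reloads a DatasetDict that was already tokenized and grouped, then trims the validation split so evaluation stays cheap. A minimal sketch under the same assumptions; the path and the 500-example cap come from the diff, while wrapping the logic in a helper function is illustrative (the script inlines it):

from datasets import DatasetDict

def load_grouped_datasets(path: str, max_val_examples: int = 500) -> DatasetDict:
    # Reload a DatasetDict that was already tokenized and grouped into
    # max_seq_length blocks, skipping the expensive preprocessing pass.
    tokenized = DatasetDict.load_from_disk(path)
    # Cap the validation split so each eval pass only scores a few hundred examples.
    tokenized["validation"] = tokenized["validation"].select(range(max_val_examples))
    return tokenized

# tokenized_datasets = load_grouped_datasets("/home/yeb/grouped_datasets")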
runs/Jul11_12-53-41_t1v-n-0e7426e8-w-0/events.out.tfevents.1626008983.t1v-n-0e7426e8-w-0.161493.3.v2
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:03ddeed93b5615c1239be282f05bf781971c8a799be72c9bebc4de1d596fbd63
-size 585827
runs/Jul12_06-43-08_t1v-n-0e7426e8-w-0/events.out.tfevents.1626072193.t1v-n-0e7426e8-w-0.238699.3.v2
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9f5f6fcc83f8cf7fac87cc276fa00a02c9ce4e252c6bb69a3988452bed73f67e
-size 200238
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:011c4912e19d0d4a05cbfc65d26cf4419ffbb740c164af6da614e90b79e811f4
|
3 |
+
size 370875
|
runs/{Jul10_08-38-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625906314.t1v-n-0e7426e8-w-0.25839.3.v2 → Jul13_13-19-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1626182353.t1v-n-0e7426e8-w-0.634946.3.v2}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:e557ecca8b2123ad7aeeae0800a959420c4d753e0700aa9b0a824e61f7a657b6
+size 1060976
runs/{Jul09_21-43-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625867209.t1v-n-0e7426e8-w-0.420316.3.v2 → Jul13_20-26-47_t1v-n-0e7426e8-w-0/events.out.tfevents.1626208012.t1v-n-0e7426e8-w-0.683072.3.v2}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f51d5afdd39639a5997bd48251851300f4761d25534c61df0149123f3d7beaf3
+size 348618
runs/{Jul11_09-15-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1625995853.t1v-n-0e7426e8-w-0.145718.3.v2 → Jul13_21-28-58_t1v-n-0e7426e8-w-0/events.out.tfevents.1626211744.t1v-n-0e7426e8-w-0.688568.3.v2}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5874afd27d6f8234168e30694876b6bda7c58a0cdeb56e640a8ee61d6c3f3bbb
+size 2975368
runs/{Jul10_07-45-49_t1v-n-0e7426e8-w-0/events.out.tfevents.1625903173.t1v-n-0e7426e8-w-0.20563.3.v2 → Jul14_05-04-35_t1v-n-0e7426e8-w-0/events.out.tfevents.1626239081.t1v-n-0e7426e8-w-0.720177.3.v2}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c75c45dc32377c52f7a7b2df50ceded99c6098035185d14135132e0626683835
+size 51858
runs/Jul14_05-21-55_t1v-n-0e7426e8-w-0/events.out.tfevents.1626240121.t1v-n-0e7426e8-w-0.722772.3.v2
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fec605fd70b4cc49e1a473d3cd871b834a2f711519812618dded0d6f397c4daf
+size 1335479
streaming_dataset_filter_test.py
CHANGED
@@ -4,26 +4,90 @@ from datasets import load_dataset

 dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)

+# data_dir = "/home/yeb"
+data_dir = "/home/yeb/Developer/data"
+data_files = []
+
+def train_val_files():
+    import glob
+    import random
+    SEED = 12345
+
+    def add_jsonlines_dir(path, filespec):
+        global data_files
+        data_files += glob.glob(f"{path}/{filespec}")
+        data_files = list(set(data_files))
+        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
+
+    # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
+    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
+    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
+    # add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
+    # add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
+    random.Random(SEED).shuffle(data_files)
+
+    total = len(data_files)
+    print(total)
+    perc = 0.05
+    val_size = int(perc * total)
+    train_size = total - val_size
+    train = data_files[:train_size]
+    val = data_files[train_size:]
+    print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
+
+    assert list(set(train) & set(val)) == [], "Train overlaps with test"
+
+    return train, val
+
+train, val = train_val_files()
+dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})
+
+
+dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")

 def f(obj):
     obj["text"] = clean_text(obj["text"])
     return obj


-dataset_v1 = dataset_v0.map(
-
+dataset_v1 = dataset_v0.map(
+    f,
+    batched=False,
+    num_proc=10,
+)

-
-
-
-
+datasets = dataset_v1.filter(
+    lambda obj: obj['text'] is not None,
+    num_proc=10,
+)

-it = iter(
+it = iter(dataset_v0['train'])
 print(next(it))
 print(next(it))
 print(next(it))

-it = iter(
+it = iter(dataset_v1['train'])
 print(next(it))
 print(next(it))
 print(next(it))
+
+# it = iter(dataset_v2)
+# print(next(it))
+# print(next(it))
+# print(next(it))
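The train_val_files helper added above collects jsonlines shards, shuffles them with a fixed seed, and holds out 5% of the files for validation. A condensed, reusable sketch of that split; the glob pattern in the usage comment is illustrative, while the script hard-codes its own data directories:

import glob
import random

def split_shards(pattern: str, val_fraction: float = 0.05, seed: int = 12345):
    # Gather shard files, de-duplicate, and shuffle deterministically so every
    # run produces the same train/validation partition of the files.
    files = sorted(set(glob.glob(pattern)))
    random.Random(seed).shuffle(files)
    val_size = int(val_fraction * len(files))
    train, val = files[: len(files) - val_size], files[len(files) - val_size:]
    assert not set(train) & set(val), "Train overlaps with validation"
    return train, val

# train, val = split_shards("/home/yeb/Developer/data/c4_cleaned2/*73*.gz")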
training_state.json
CHANGED
@@ -1 +1 @@
-{"step":
+{"step": 30204}
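training_state.json records the global step reached so far; the resume check in run_t5_mlm_flax_custom_dataset.py (still commented out in this commit) would compare against such a value. A minimal sketch of reading that state when restarting, assuming the same single-key JSON layout:

import json

# Read the last recorded global step so a restarted run can skip batches that
# were already processed (cf. the commented-out "if cur_step < resume_step").
with open("training_state.json") as f:
    resume_step = json.load(f)["step"]
print(resume_step)  # 30204 after this commit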