yhavinga committed
Commit 34a1eb7 • 1 Parent(s): f95015e

Update weights and scripts

Files changed (16)
  1. Load_preprocessed_dataset.ipynb +22 -4
  2. flax_model.msgpack +1 -1
  3. opt_state.msgpack +1 -1
  4. pytorch_model.bin +1 -1
  5. run_t5.sh +30 -6
  6. run_t5_mlm_flax_custom_dataset.py +8 -6
  7. runs/Jul11_12-53-41_t1v-n-0e7426e8-w-0/events.out.tfevents.1626008983.t1v-n-0e7426e8-w-0.161493.3.v2 +0 -3
  8. runs/Jul12_06-43-08_t1v-n-0e7426e8-w-0/events.out.tfevents.1626072193.t1v-n-0e7426e8-w-0.238699.3.v2 +0 -3
  9. runs/Jul13_10-43-12_t1v-n-0e7426e8-w-0/events.out.tfevents.1626172997.t1v-n-0e7426e8-w-0.622440.3.v2 +2 -2
  10. runs/{Jul10_08-38-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625906314.t1v-n-0e7426e8-w-0.25839.3.v2 → Jul13_13-19-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1626182353.t1v-n-0e7426e8-w-0.634946.3.v2} +2 -2
  11. runs/{Jul09_21-43-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625867209.t1v-n-0e7426e8-w-0.420316.3.v2 → Jul13_20-26-47_t1v-n-0e7426e8-w-0/events.out.tfevents.1626208012.t1v-n-0e7426e8-w-0.683072.3.v2} +2 -2
  12. runs/{Jul11_09-15-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1625995853.t1v-n-0e7426e8-w-0.145718.3.v2 → Jul13_21-28-58_t1v-n-0e7426e8-w-0/events.out.tfevents.1626211744.t1v-n-0e7426e8-w-0.688568.3.v2} +2 -2
  13. runs/{Jul10_07-45-49_t1v-n-0e7426e8-w-0/events.out.tfevents.1625903173.t1v-n-0e7426e8-w-0.20563.3.v2 → Jul14_05-04-35_t1v-n-0e7426e8-w-0/events.out.tfevents.1626239081.t1v-n-0e7426e8-w-0.720177.3.v2} +2 -2
  14. runs/Jul14_05-21-55_t1v-n-0e7426e8-w-0/events.out.tfevents.1626240121.t1v-n-0e7426e8-w-0.722772.3.v2 +3 -0
  15. streaming_dataset_filter_test.py +72 -8
  16. training_state.json +1 -1
Load_preprocessed_dataset.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 10,
+ "execution_count": 4,
  "id": "cf148030-7287-4c9e-ae32-8d1e1c47be30",
  "metadata": {},
  "outputs": [],
@@ -12,12 +12,30 @@
  },
  {
  "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 7,
  "id": "5161b4ba-e8cf-43e1-b67e-503c29aa4271",
  "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: '/home/yeb/grouped_dataset/dataset_dict.json'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipykernel_574434/3668239933.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdatasets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDatasetDict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_from_disk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/yeb/grouped_dataset\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/datasets/src/datasets/dataset_dict.py\u001b[0m in \u001b[0;36mload_from_disk\u001b[0;34m(dataset_dict_path, fs, keep_in_memory)\u001b[0m\n\u001b[1;32m 727\u001b[0m \u001b[0;34mf\"No such file or directory: '{dataset_dict_json_path}'. Expected to load a DatasetDict object, but got a Dataset. Please use datasets.load_from_disk instead.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 728\u001b[0m )\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset_dict_json_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"splits\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m dataset_dict_split_path = (\n\u001b[1;32m 731\u001b[0m \u001b[0mdataset_dict_path\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"://\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"://\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdest_dataset_dict_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_posix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/venv/lib/python3.8/site-packages/fsspec/spec.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, path, mode, block_size, cache_options, **kwargs)\u001b[0m\n\u001b[1;32m 956\u001b[0m }\n\u001b[1;32m 957\u001b[0m return io.TextIOWrapper(\n\u001b[0;32m--> 958\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mtext_kwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 959\u001b[0m )\n\u001b[1;32m 960\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/venv/lib/python3.8/site-packages/fsspec/spec.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(self, path, mode, block_size, cache_options, **kwargs)\u001b[0m\n\u001b[1;32m 960\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 961\u001b[0m \u001b[0mac\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"autocommit\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_intrans\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 962\u001b[0;31m f = self._open(\n\u001b[0m\u001b[1;32m 963\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/venv/lib/python3.8/site-packages/fsspec/implementations/local.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self, path, mode, block_size, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_mkdir\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"w\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 144\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mLocalFileOpener\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 145\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mtouch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/venv/lib/python3.8/site-packages/fsspec/implementations/local.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path, mode, autocommit, fs, compression, **kwargs)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompression\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_compression\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcompression\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblocksize\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDEFAULT_BUFFER_SIZE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/venv/lib/python3.8/site-packages/fsspec/implementations/local.py\u001b[0m in \u001b[0;36m_open\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclosed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautocommit\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m\"w\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompression\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0mcompress\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompression\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/home/yeb/grouped_dataset/dataset_dict.json'"
+ ]
+ }
+ ],
  "source": [
- "datasets = DatasetDict.load_from_disk(\"./grouped_dataset\")"
+ "datasets = DatasetDict.load_from_disk(\"/home/yeb/grouped_dataset\")"
  ]
  },
  {
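Note on the new cell output: DatasetDict.load_from_disk fails here with FileNotFoundError because it looks for a dataset_dict.json inside the target directory, and the library frame quoted in the traceback notes that a directory holding a single Dataset should be loaded with datasets.load_from_disk instead. A minimal, illustrative sketch of a more forgiving loader (the path is the one used in the notebook; the fallback logic is not part of this commit):

import os
from datasets import DatasetDict, load_from_disk

path = "/home/yeb/grouped_dataset"  # directory written by save_to_disk
if os.path.exists(os.path.join(path, "dataset_dict.json")):
    # Directory holds a DatasetDict (one sub-directory per split).
    datasets = DatasetDict.load_from_disk(path)
else:
    # Directory holds a single Dataset; load_from_disk handles either layout.
    datasets = load_from_disk(path)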
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e813a6c7760ec8638a3cf5e19b5148dad0aa0761b3c6ea82f0f74352b0308057
+ oid sha256:0a2f203c6e4fb395cd1af46eddc3fdbb688e995e9d67023a95fec04d3a338d3d
  size 891548548
opt_state.msgpack CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c43dcbbf24d445c2dbbc45e8fc6df64c4f79f286c21b9165e1b7d6510d498519
+ oid sha256:e32d952b358ab33b2ce966fdff36cd2b253376e77f7867b0d920ea8926c0c08a
  size 1985609
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:30e5deffc7b2da9d2bba7f7b471916fd126caf80f4c0a8b1204c8f7dc62fbc3e
+ oid sha256:aa8fa3e8f3ce483fb79cee84d6a4e2057ad2706010fb522c44cad201f8c0af66
  size 891650495
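The three weight files above are Git LFS pointers: the commit swaps the sha256 oid while the byte size stays identical, so only the object content behind the pointer changes. A small, illustrative sketch of checking a downloaded blob against such a pointer (the helper and file names are hypothetical, not part of the repo):

import hashlib

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Parse a Git LFS pointer file and compare oid/size against a local blob."""
    fields = dict(
        line.split(" ", 1) for line in open(pointer_path).read().splitlines() if line
    )
    expected_oid = fields["oid"].split(":", 1)[1]  # "sha256:<hex>" -> "<hex>"
    expected_size = int(fields["size"])
    digest = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# e.g. verify_lfs_pointer("flax_model.msgpack.pointer", "flax_model.msgpack")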
run_t5.sh CHANGED
@@ -7,6 +7,30 @@ mkdir -p "${MODEL_DIR}/runs"
  # T5 paper lr 0.01 with batch size 128
  # We have a batch size of 8 devices * 32 = 256, so lr = 0.01/2

+ #SEED=9200
+ #
+ #./run_t5_mlm_flax_custom_dataset.py \
+ # --output_dir="${MODEL_DIR}" \
+ # --model_type="t5" \
+ # --config_name="flax-community/${MODEL}" \
+ # --tokenizer_name="${MODEL_DIR}" \
+ # --seed="${SEED}" \
+ # --preprocessing_num_workers="96" \
+ # --do_train --do_eval \
+ # --adafactor \
+ # --max_seq_length="512" \
+ # --per_device_train_batch_size="32" \
+ # --per_device_eval_batch_size="32" \
+ # --dtype="bfloat16" \
+ # --learning_rate="5e-3" \
+ # --overwrite_output_dir \
+ # --num_train_epochs="3" \
+ # --logging_steps="50" \
+ # --save_steps="100" \
+ # --eval_steps="5000" \
+ # --warmup_steps="3413"
+ #exit
+
  while true; do

  # Set the seed to random before each run, so date shuffling per epoch is different each run.
@@ -23,17 +47,17 @@ while true; do
  --do_train --do_eval \
  --adafactor \
  --max_seq_length="512" \
- --per_device_train_batch_size="32" \
- --per_device_eval_batch_size="32" \
- --learning_rate="5e-3" \
+ --per_device_train_batch_size="16" \
+ --per_device_eval_batch_size="16" \
  --dtype="bfloat16" \
+ --learning_rate="1e-2" \
  --overwrite_output_dir \
  --num_train_epochs="3" \
  --logging_steps="50" \
- --save_steps="501" \
- --eval_steps="10000000" \
+ --save_steps="500" \
+ --eval_steps="5000" \
  --resume_from_checkpoint="${MODEL_DIR}" \
- --warmup_steps="3413"
+ --warmup_steps="6519"

  # \
  # --push_to_hub
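The flag changes fit the learning-rate comment at the top of the script: with 8 devices and the per-device batch size reduced from 32 to 16, the effective batch becomes 8 * 16 = 128, matching the T5 paper, so the learning rate moves from 5e-3 back to the paper's 1e-2. A short illustrative sketch of the scaling rule implied by the old and new values (constants taken from the script comment; the helper name is made up):

# Learning-rate rule implied by the comment and the old/new flag values:
# lr 0.01 at batch 128 (T5 paper), halved when the effective batch doubled to 256.
PAPER_LR = 1e-2      # T5 paper learning rate
PAPER_BATCH = 128    # T5 paper batch size
NUM_DEVICES = 8      # TPU cores, per the script comment

def implied_lr(per_device_batch_size: int) -> float:
    effective_batch = NUM_DEVICES * per_device_batch_size
    return PAPER_LR * PAPER_BATCH / effective_batch

print(implied_lr(32))  # 0.005 -> the old --learning_rate="5e-3"
print(implied_lr(16))  # 0.01  -> the new --learning_rate="1e-2"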
run_t5_mlm_flax_custom_dataset.py CHANGED
@@ -645,7 +645,12 @@ if __name__ == "__main__":

  # Preprocessing the datasets.
  # First we tokenize all the texts.
- if not load_grouped:
+ if load_grouped:
+ logger.info("Loading tokenized and grouped dataset")
+ tokenized_datasets = DatasetDict.load_from_disk("/home/yeb/grouped_datasets")
+ logger.info("Setting max validation examples to 500")
+ tokenized_datasets['validation'] = tokenized_datasets['validation'].select(range(500))
+ else:
  if training_args.do_train:
  column_names = datasets["train"].column_names
  else:
@@ -696,9 +701,6 @@ if __name__ == "__main__":
  num_proc=data_args.preprocessing_num_workers,
  load_from_cache_file=not data_args.overwrite_cache,
  )
- else:
- logger.info("Loading tokenized and grouped dataset")
- tokenized_datasets = DatasetDict.load_from_disk("/home/yeb/grouped_datasets")

  # Enable tensorboard only on the master node
  has_tensorboard = is_tensorboard_available()
@@ -904,8 +906,8 @@ if __name__ == "__main__":
  for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
  cur_step = epoch * (num_train_samples // train_batch_size) + step
  # skip to the step from which we are resuming
- # if cur_step < resume_step:
- # continue
+ # if cur_step < resume_step:
+ # continue

  samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
  model_inputs = data_collator(samples)
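For reference, the new load_grouped branch skips on-the-fly tokenization and grouping and instead reads a dataset that was preprocessed offline, then caps validation at 500 examples. A self-contained sketch of that path (the on-disk path and the 500-example cap are taken from the diff; the function wrapper is illustrative):

import logging
from datasets import DatasetDict

logger = logging.getLogger(__name__)

def load_pregrouped(path: str = "/home/yeb/grouped_datasets") -> DatasetDict:
    """Load an already tokenized and grouped DatasetDict and cap validation at 500 rows."""
    logger.info("Loading tokenized and grouped dataset")
    tokenized_datasets = DatasetDict.load_from_disk(path)
    logger.info("Setting max validation examples to 500")
    tokenized_datasets["validation"] = tokenized_datasets["validation"].select(range(500))
    return tokenized_datasets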
runs/Jul11_12-53-41_t1v-n-0e7426e8-w-0/events.out.tfevents.1626008983.t1v-n-0e7426e8-w-0.161493.3.v2 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:03ddeed93b5615c1239be282f05bf781971c8a799be72c9bebc4de1d596fbd63
- size 585827
runs/Jul12_06-43-08_t1v-n-0e7426e8-w-0/events.out.tfevents.1626072193.t1v-n-0e7426e8-w-0.238699.3.v2 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9f5f6fcc83f8cf7fac87cc276fa00a02c9ce4e252c6bb69a3988452bed73f67e
- size 200238
runs/Jul13_10-43-12_t1v-n-0e7426e8-w-0/events.out.tfevents.1626172997.t1v-n-0e7426e8-w-0.622440.3.v2 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3cd61608c1fe7f600b38022829414d08eca065022eeb12ac3f3d4590930ca124
- size 96372
+ oid sha256:011c4912e19d0d4a05cbfc65d26cf4419ffbb740c164af6da614e90b79e811f4
+ size 370875
runs/{Jul10_08-38-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625906314.t1v-n-0e7426e8-w-0.25839.3.v2 → Jul13_13-19-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1626182353.t1v-n-0e7426e8-w-0.634946.3.v2} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:59007128de97ddf2f570d88ff0648750c666c92c091b6c87561e3fb035afb4dd
- size 259155
+ oid sha256:e557ecca8b2123ad7aeeae0800a959420c4d753e0700aa9b0a824e61f7a657b6
+ size 1060976
runs/{Jul09_21-43-10_t1v-n-0e7426e8-w-0/events.out.tfevents.1625867209.t1v-n-0e7426e8-w-0.420316.3.v2 → Jul13_20-26-47_t1v-n-0e7426e8-w-0/events.out.tfevents.1626208012.t1v-n-0e7426e8-w-0.683072.3.v2} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ae21cbc674e3cae9cab24545f299bd4954fde9ac83daed7eca6e6b8ff17aa26e
- size 524288
+ oid sha256:f51d5afdd39639a5997bd48251851300f4761d25534c61df0149123f3d7beaf3
+ size 348618
runs/{Jul11_09-15-07_t1v-n-0e7426e8-w-0/events.out.tfevents.1625995853.t1v-n-0e7426e8-w-0.145718.3.v2 → Jul13_21-28-58_t1v-n-0e7426e8-w-0/events.out.tfevents.1626211744.t1v-n-0e7426e8-w-0.688568.3.v2} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f1aed2c3d6af21a006e34d03f0d6071dd41995d510e1cee680aad026186ca98f
- size 303949
+ oid sha256:5874afd27d6f8234168e30694876b6bda7c58a0cdeb56e640a8ee61d6c3f3bbb
+ size 2975368
runs/{Jul10_07-45-49_t1v-n-0e7426e8-w-0/events.out.tfevents.1625903173.t1v-n-0e7426e8-w-0.20563.3.v2 → Jul14_05-04-35_t1v-n-0e7426e8-w-0/events.out.tfevents.1626239081.t1v-n-0e7426e8-w-0.720177.3.v2} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:07af3cc1b9177077621a9f9cd2b3d36aa028237973caf686c9e324e89cc25de9
- size 147729
+ oid sha256:c75c45dc32377c52f7a7b2df50ceded99c6098035185d14135132e0626683835
+ size 51858
runs/Jul14_05-21-55_t1v-n-0e7426e8-w-0/events.out.tfevents.1626240121.t1v-n-0e7426e8-w-0.722772.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fec605fd70b4cc49e1a473d3cd871b834a2f711519812618dded0d6f397c4daf
+ size 1335479
streaming_dataset_filter_test.py CHANGED
@@ -4,26 +4,90 @@ from datasets import load_dataset

  dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)

+ # data_dir = "/home/yeb"
+ data_dir = "/home/yeb/Developer/data"
+ data_files = []
+
+ def train_val_files():
+ import glob
+ import random
+ SEED = 12345
+
+ def add_jsonlines_dir(path, filespec):
+ global data_files
+ data_files += glob.glob(f"{path}/{filespec}")
+ data_files = list(set(data_files))
+ print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
+
+ # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
+ add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
+ # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
+ # add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
+ # add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
+ random.Random(SEED).shuffle(data_files)
+
+ total = len(data_files)
+ print(total)
+ perc = 0.05
+ val_size = int(perc * total)
+ train_size = total - val_size
+ train = data_files[:train_size]
+ val = data_files[train_size:]
+ print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")
+
+ assert list(set(train) & set(val)) == [], "Train overlaps with test"
+
+ return train, val
+
+ train, val = train_val_files()
+ dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})
+
+
+ dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")

  def f(obj):
  obj["text"] = clean_text(obj["text"])
  return obj


- dataset_v1 = dataset_v0.map(f)
- dataset_v2 = dataset_v1.filter(lambda obj: obj['text'] is not None)
+ dataset_v1 = dataset_v0.map(
+ f,
+ batched=False,
+ num_proc=10,
+ )

- it = iter(dataset_v0)
- print(next(it))
- print(next(it))
- print(next(it))
+ datasets = dataset_v1.filter(
+ lambda obj: obj['text'] is not None,
+ num_proc=10,
+ )

- it = iter(dataset_v1)
+ it = iter(dataset_v0['train'])
  print(next(it))
  print(next(it))
  print(next(it))

- it = iter(dataset_v2)
+ it = iter(dataset_v1['train'])
  print(next(it))
  print(next(it))
  print(next(it))
+
+ # it = iter(dataset_v2)
+ # print(next(it))
+ # print(next(it))
+ # print(next(it))
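Most of the new test code builds a deterministic train/validation split over jsonlines shards: glob the files, shuffle with a fixed seed, hold out 5% for validation, and assert the two sets are disjoint. The same idea as a compact, illustrative helper without the module-level global (directory, seed, and split fraction follow the diff; the function name is made up):

import glob
import random

def split_files(pattern: str, val_fraction: float = 0.05, seed: int = 12345):
    """Deterministically split the files matching a glob pattern into train/validation lists."""
    files = sorted(set(glob.glob(pattern)))  # sort before shuffling so the split is reproducible
    random.Random(seed).shuffle(files)
    val_size = int(val_fraction * len(files))
    train_size = len(files) - val_size
    train, val = files[:train_size], files[train_size:]
    assert not set(train) & set(val), "Train overlaps with validation"
    return train, val

# Example usage with the shard pattern from the diff:
# train, val = split_files("/home/yeb/Developer/data/c4_cleaned2/*73*.gz")
# dataset = load_dataset('json', data_files={'train': train, 'validation': val})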
training_state.json CHANGED
@@ -1 +1 @@
- {"step": 502}
+ {"step": 30204}