diff --git "a/Load data & train tokenizer.ipynb" "b/Load data & train tokenizer.ipynb" --- "a/Load data & train tokenizer.ipynb" +++ "b/Load data & train tokenizer.ipynb" @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 71, "id": "723b5d4d", "metadata": {}, "outputs": [], @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 72, "id": "f4a5edee", "metadata": {}, "outputs": [], @@ -42,6 +42,55 @@ "from transformers import AutoConfig\n" ] }, + { + "cell_type": "code", + "execution_count": 74, + "id": "9241a429", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "348a4dd4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + } + ], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"./\")" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "595f318e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "36450" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.vocab_size" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -66,23 +115,1049 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "id": "39b9fc3d", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "2848" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([x for x in data_files if isinstance(x, str)])" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "id": "ba855add", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "['/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00943-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00018-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01012-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00625-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00070-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00108-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00315-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00056-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00140-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00128-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00221-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00394-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00469-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00547-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00444-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00000-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00129-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00229-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00335-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00792-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00090-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00584-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00986-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00618-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00824-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00114-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00034-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00465-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00185-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01013-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00310-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00071-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00030-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00132-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00074-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00480-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00460-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00847-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00783-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00141-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00967-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00145-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00586-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00188-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00745-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00047-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00850-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00124-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00952-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00333-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00005-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00760-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00882-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00581-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00164-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00120-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00509-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00167-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00180-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00017-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00167-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01004-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00756-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00728-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00033-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00551-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00132-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00231-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00924-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00725-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00362-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00123-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01000-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00161-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00344-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00213-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00721-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01011-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00446-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00235-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00061-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00671-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00294-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00177-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00081-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00407-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00113-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00030-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00293-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00147-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00698-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00598-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00006-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00354-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00860-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00841-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00481-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00129-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00109-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00478-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00667-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00390-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00525-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00449-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00126-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00016-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00197-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00762-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00247-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00982-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00130-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00184-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00063-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00445-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00116-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00675-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00838-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00726-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00191-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00832-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00110-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00012-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00098-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00257-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00130-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00659-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00355-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00487-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00324-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00459-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00439-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01007-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00706-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00512-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00073-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00051-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00911-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00013-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00987-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00188-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00220-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00885-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00905-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00813-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00326-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01015-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00457-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00562-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00503-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00845-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00755-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00969-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00949-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00668-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00042-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00146-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00302-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00050-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00002-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00068-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00608-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00616-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00573-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00127-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00171-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00149-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00516-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00176-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00451-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00597-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00311-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00747-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00430-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00743-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00561-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00161-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00534-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00029-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00448-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00022-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00096-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00736-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00672-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00533-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00295-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00438-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00776-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00176-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00375-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00298-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00501-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00145-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00152-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00524-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00693-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00252-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00772-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00102-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00468-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00165-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00199-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00409-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00095-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00816-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00136-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00908-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00942-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00611-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00304-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00825-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00880-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00157-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00643-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00602-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00050-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00009-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00995-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00175-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00025-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00147-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00039-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00104-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00140-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00278-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00663-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00058-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00846-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00314-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00486-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00073-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00622-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00153-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00630-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00042-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00740-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00172-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00121-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01023-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00156-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00759-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00148-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00007-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00811-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00270-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00360-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00541-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00176-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00121-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00948-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00628-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00106-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00208-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00172-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00072-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00920-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00006-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00582-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00983-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00594-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00461-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00134-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00118-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00081-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00093-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00277-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00377-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00034-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00424-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00421-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00162-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00488-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01016-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00703-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00748-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00866-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00096-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00170-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00248-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00669-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00436-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00085-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00036-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00058-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00695-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01020-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00817-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00844-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00477-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00224-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00464-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00564-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00442-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00065-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00592-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01017-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00181-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00273-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00957-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00153-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00035-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00543-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00940-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00526-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00275-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00161-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00812-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00858-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00992-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00769-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00015-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00753-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00413-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00435-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00351-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00031-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00180-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00021-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00734-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00854-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00859-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00018-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00219-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00836-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00895-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01009-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00632-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00530-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00508-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00709-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00077-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00578-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00945-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00182-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00139-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00047-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00035-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00035-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00387-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00001-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00804-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00474-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00383-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00181-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01021-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00679-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00338-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00179-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00042-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00961-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00010-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00091-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00423-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00290-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00947-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00133-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00380-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00946-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00604-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00045-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00082-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00493-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00552-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00152-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00732-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00427-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00216-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00746-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00057-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00781-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00918-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00134-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00046-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00453-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00099-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00704-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00361-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00067-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00510-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00088-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00210-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00325-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00605-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00080-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00651-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00367-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00822-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00041-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00358-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00142-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00491-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00892-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00190-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00356-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00068-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00359-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00087-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00452-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00554-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00259-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00086-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00904-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00012-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00105-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00032-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00996-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00192-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00443-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00909-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00938-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00162-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00214-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00447-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00839-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00856-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00476-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00371-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00504-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00253-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00921-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00408-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00000-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00973-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01001-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00048-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00179-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00146-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00261-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00902-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00092-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00126-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00026-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00154-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00640-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00994-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00156-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00228-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00038-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00654-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00852-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00128-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00268-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00933-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00492-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00056-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00232-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00808-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00398-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00401-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00386-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00179-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00650-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00197-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00093-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00114-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00626-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00317-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00336-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00012-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00606-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00340-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00861-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00089-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00515-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00378-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00684-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00647-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00196-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00025-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00266-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00204-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00022-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00097-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00160-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00059-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00113-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00182-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00060-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00368-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00001-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00353-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00062-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00198-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00175-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00026-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00143-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00016-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00008-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00189-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00334-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00071-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00519-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00773-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00159-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00624-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00105-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00109-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00112-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00754-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00184-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01005-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00289-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00136-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00194-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00775-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00768-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00402-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00868-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00827-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00689-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00894-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00802-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00980-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00661-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00523-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00631-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00073-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00490-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00473-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00173-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00319-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00791-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00321-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00194-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00715-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00132-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00100-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00053-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00433-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00455-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00142-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00011-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00585-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00798-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00842-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00050-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00657-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00102-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00750-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00072-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00716-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00702-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00285-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00002-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00020-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00761-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01014-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00553-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00181-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00500-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00287-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00422-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00076-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00511-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00246-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00092-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00159-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00320-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00869-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00194-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00031-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00855-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00158-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00098-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00102-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00686-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00167-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00887-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00737-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00155-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00069-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00016-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00299-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00168-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00462-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00416-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00627-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00567-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00559-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00799-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00364-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00172-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00521-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00187-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00062-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00119-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00030-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00027-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00739-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00041-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00888-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00934-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00019-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00154-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00330-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00786-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00939-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00066-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00043-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00881-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00391-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00112-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00332-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00593-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01022-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00127-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00141-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00629-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00953-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00242-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00054-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00112-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00207-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00990-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00463-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00145-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00009-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00713-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00414-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00119-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00874-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00682-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00150-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00122-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00193-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00535-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00610-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00198-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00023-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00212-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00470-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00678-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00192-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00117-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00040-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00027-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00349-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00576-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00549-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00023-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00612-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00171-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00003-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00502-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00084-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00884-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00309-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00690-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00696-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00784-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00031-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00280-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00697-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00536-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00195-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00712-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00107-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00454-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00150-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00203-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00806-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00999-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00496-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00404-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00857-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00771-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00185-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00045-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00475-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00575-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00879-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00357-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00665-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00057-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00951-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00979-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00906-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00062-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00024-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00495-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00692-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00095-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00014-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00201-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00805-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00039-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00388-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00032-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00589-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00186-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00677-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00411-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00641-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00061-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00079-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00028-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00119-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00527-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00346-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00720-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00829-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00558-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00064-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00676-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00774-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00574-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00899-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00596-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00074-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00069-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00125-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00341-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00456-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00393-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00020-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00258-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00514-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00108-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00271-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00089-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00091-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00146-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00044-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00590-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00008-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00074-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00914-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00296-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00800-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00163-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00190-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00484-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00144-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00100-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00991-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00965-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00507-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00916-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00563-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00269-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00123-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00084-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00889-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00744-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00862-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00777-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00719-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00014-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00087-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00144-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00051-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00196-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00226-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00071-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00028-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00024-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00080-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00428-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00579-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00619-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00607-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00968-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00052-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00020-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00128-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00620-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00372-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00187-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00090-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00099-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00084-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00100-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00171-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00120-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00544-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00997-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00078-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00801-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00041-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00699-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00571-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00054-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00034-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00871-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00710-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00653-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00803-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00107-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00382-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00539-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00155-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00782-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00531-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00912-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00680-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00714-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00052-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00255-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00944-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00494-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00603-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00316-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00049-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00158-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00191-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00010-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00066-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00322-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00250-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00656-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00963-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00262-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00168-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00810-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00072-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00138-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00272-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00313-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00318-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00863-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00077-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00126-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00742-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00738-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00780-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00189-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00658-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00701-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00286-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00568-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00853-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00931-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00717-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00138-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00046-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00059-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00118-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00993-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00158-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00166-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00044-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00864-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00017-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00093-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00638-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00327-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00385-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00988-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00117-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00120-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00237-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00941-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00307-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00157-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00705-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00613-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00014-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00244-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00978-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00113-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00730-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00151-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00583-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00870-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00415-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00183-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00065-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00075-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00193-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00022-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00935-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00376-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00192-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00793-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00149-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00160-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00365-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00412-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00998-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00039-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00621-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00182-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00615-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00040-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00168-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00148-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00117-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00345-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00241-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00013-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00896-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00149-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00061-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00180-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00082-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00114-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00021-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00153-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00420-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00002-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00056-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00007-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00110-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00160-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00018-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00164-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00174-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00555-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00028-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00985-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00397-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00588-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00101-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00066-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00797-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00143-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00103-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00954-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00649-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00722-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00545-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00700-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00254-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00482-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00079-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00550-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00645-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00572-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00837-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00329-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00108-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00635-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00116-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00054-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00283-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00080-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00037-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00483-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00059-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00591-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00694-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00134-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00225-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00206-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00970-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00569-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00169-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00472-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00929-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00130-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00300-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00138-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00890-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00005-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00891-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00913-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00038-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00019-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00518-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00830-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01002-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00363-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00789-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00053-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00683-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00111-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00431-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00223-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00809-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00767-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00642-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00218-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00052-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00685-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00876-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00347-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00027-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00024-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00828-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00075-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00133-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00927-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00831-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00749-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00279-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00005-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00634-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00546-of-01024.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00055-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00849-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00165-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00209-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00029-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00198-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00757-of-01024.json.gz',\n", + " '/data/nrc_uniq_cleaned_20210223/part-00124-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", + " '/data/nu_uniq_cleaned_20210225/part-00105-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00323-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00498-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00795-of-01024.json.gz',\n", + " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00366-of-01024.json.gz',\n", + " ...]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_files" + ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 45, "id": "59076aa7", "metadata": {}, "outputs": [ @@ -90,68 +1165,145 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of files 20 after adding /data/c4_cleaned\n" + "Number of files 2448 after adding /data/c4_cleaned2 glob *.gz\n", + "Number of files 2648 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", + "Number of files 2848 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n" ] + }, + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "#59G c4_cleaned compressed\n", - "#937M nrc_uniq_cleaned_20210223 compressed\n", - "#410M nu_uniq_cleaned_20210225 compressed\n", - "#9.9G oscar_nl_cleaned compressed\n", - "\n", - "\n", - "\n", - "data_files = []\n", - "SEED=42\n", - "def add_jsonlines_dir(path):\n", - " global data_files\n", - " #data_files += glob.glob(f\"{path}/*47*.gz\")\n", - " #data_files += glob.glob(f\"{path}/*32*.gz\")\n", - " #data_files += glob.glob(f\"{path}/*59*.gz\")\n", - " data_files += glob.glob(f\"{path}/*11*.gz\")\n", - " print(f\"Number of files {len(data_files)} after adding {path}\")\n", - " \n", - "add_jsonlines_dir(\"/data/c4_cleaned\")\n", - "#add_jsonlines_dir(\"/data/nrc_uniq_cleaned_20210223\")\n", - "#add_jsonlines_dir(\"/data/nu_uniq_cleaned_20210225\")\n", - "#add_jsonlines_dir(\"/data/oscar_nl_cleaned\") This one gives an error like field url not in \n", - "\n" + " datafiles = []\n", + " import glob\n", + " import random\n", + " SEED = 12345\n", + " def add_jsonlines_dir(path, filespec):\n", + " global data_files\n", + " data_files += glob.glob(f\"{path}/{filespec}\")\n", + " data_files = list(set(data_files))\n", + " print(f\"Number of files {len(data_files)} after adding {path} glob {filespec}\")\n", + " add_jsonlines_dir(f\"/data/c4_cleaned2\", \"*.gz\")\n", + " add_jsonlines_dir(f\"/data/nrc_uniq_cleaned_20210223\", \"*.gz\")\n", + " add_jsonlines_dir(f\"/data/nu_uniq_cleaned_20210225\", \"*.gz\")\n", + " datafiles" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "fc9519d2", + "execution_count": 38, + "id": "7c5980cd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of files 209 after adding /data/oscar_nl_cleaned\n", - "95%: 199\n", - "Got 199 training files and 10 validation files\n" + "Number of files 1424 after adding /data/c4_cleaned2 glob *.gz\n", + "Number of files 1424 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", + "Number of files 1424 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n" + ] + } + ], + "source": [ + "train_val_files()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "969b8fa4", + "metadata": {}, + "outputs": [], + "source": [ + " random.Random(SEED).shuffle(data_files)\n", + " total = len(data_files)\n", + " print(total)\n", + " perc = 0.05\n", + " val_size = int(perc * total)\n", + " train_size = total - val_size\n", + " train = data_files[:train_size]\n", + " val = data_files[train_size:]\n", + " print(f\"Got {len(train)} training files and {perc*100} % {len(val)} validation files\")\n", + " assert list(set(train) & set(val)) == [], \"Train overlaps with test\"\n", + " return train, val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f92c2b13", + "metadata": {}, + "outputs": [], + "source": [ + " datafiles = []\n", + " import glob\n", + " import random\n", + " SEED = 12345\n", + " def add_jsonlines_dir(path, filespec):\n", + " global data_files\n", + " data_files += glob.glob(f\"{path}/{filespec}\")\n", + " data_files = list(set(data_files))\n", + " print(f\"Number of files {len(data_files)} after adding {path} glob {filespec}\")\n", + " add_jsonlines_dir(f\"/data/c4_cleaned2\", \"*.gz\")\n", + " add_jsonlines_dir(f\"/data/nrc_uniq_cleaned_20210223\", \"*.gz\")\n", + " add_jsonlines_dir(f\"/data/nu_uniq_cleaned_20210225\", \"*.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ffb5036", + "metadata": {}, + "outputs": [], + "source": [ + "datasets[\"train\"] = load_dataset(datafiles, split=\"train[5%:]\")\n", + "datasets[\"validation\"] = load_dataset(datafiles, split=\"train[5%:]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "31e5a164", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files 1424 after adding /data/c4_cleaned2 glob *.gz\n", + "Number of files 1424 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", + "Number of files 1424 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n", + "1424\n", + "Got 1353 training files and 5.0 % 71 validation files\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Using custom data configuration default-00e4c1e272015fdb\n" + "WARNING:datasets.builder:Using custom data configuration default-28929211ee23e224\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/dat/.cache/huggingface/datasets/json/default-00e4c1e272015fdb/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723...\n" + "Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/dat/.cache/huggingface/datasets/json/default-28929211ee23e224/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7fc9159a741a4853abb8fa1abcb8bd4c", + "model_id": "e7b73482da6744639826bd7a677f17ff", "version_major": 2, "version_minor": 0 }, @@ -163,24 +1315,141 @@ "output_type": "display_data" }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "db9fc4eb87094fa9aef909f8e8d41124", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0 tables [00:00, ? tables/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, + "ename": "JSONDecodeError", + "evalue": "Extra data: line 2 column 1 (char 651)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mArrowInvalid\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/packaged_modules/json/json.py\u001b[0m in \u001b[0;36m_generate_tables\u001b[0;34m(self, files)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m pa_table = paj.read_json(\n\u001b[0m\u001b[1;32m 82\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpa_read_options\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparse_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpa_parse_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/pyarrow/_json.pyx\u001b[0m in \u001b[0;36mpyarrow._json.read_json\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/pyarrow/error.pxi\u001b[0m in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/pyarrow/error.pxi\u001b[0m in \u001b[0;36mpyarrow.lib.check_status\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mArrowInvalid\u001b[0m: JSON parse error: Missing a closing quotation mark in string. in row 93", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_371965/265278772.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_val_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdatasets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'json'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_files\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validation'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mval\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, script_version, use_auth_token, task, streaming, **config_kwargs)\u001b[0m\n\u001b[1;32m 839\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 840\u001b[0m \u001b[0;31m# Download and prepare data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 841\u001b[0;31m builder_instance.download_and_prepare(\n\u001b[0m\u001b[1;32m 842\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 843\u001b[0m \u001b[0mdownload_mode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_mode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[0;34m(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwarning\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"HF google storage unreachable. Downloading and preparing it from source\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdownloaded_from_gcs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 583\u001b[0;31m self._download_and_prepare(\n\u001b[0m\u001b[1;32m 584\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_infos\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverify_infos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mdownload_and_prepare_kwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m )\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[0;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m 659\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 660\u001b[0m \u001b[0;31m# Prepare split will record examples associated to the split\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 661\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msplit_generator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 662\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 663\u001b[0m raise OSError(\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36m_prepare_split\u001b[0;34m(self, split_generator)\u001b[0m\n\u001b[1;32m 1125\u001b[0m \u001b[0mgenerator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_generate_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0msplit_generator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgen_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mArrowWriter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfpath\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwriter\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1127\u001b[0;31m for key, table in utils.tqdm(\n\u001b[0m\u001b[1;32m 1128\u001b[0m \u001b[0mgenerator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\" tables\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mleave\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdisable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbool\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_verbosity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNOTSET\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1129\u001b[0m ):\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/packaged_modules/json/json.py\u001b[0m in \u001b[0;36m_generate_tables\u001b[0;34m(self, files)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArrowInvalid\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0mdataset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m raise ValueError(\n\u001b[1;32m 88\u001b[0m \u001b[0;34mf\"Not able to read records in the JSON file at {file}. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3.8/json/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0mkwarg\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0motherwise\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mJSONDecoder\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mused\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \"\"\"\n\u001b[0;32m--> 293\u001b[0;31m return loads(fp.read(),\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobject_hook\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mobject_hook\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0mparse_float\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparse_float\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparse_int\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparse_int\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3.8/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3.8/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Extra data\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 341\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mJSONDecodeError\u001b[0m: Extra data: line 2 column 1 (char 651)" + ] + } + ], + "source": [ + "train, val = train_val_files()\n", + "datasets = load_dataset('json', data_files={'train': train, 'validation': val})" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "69445179", + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dataset json downloaded and prepared to /home/dat/.cache/huggingface/datasets/json/default-00e4c1e272015fdb/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723. Subsequent calls will reuse this data.\n" + "Number of files 1024 after adding /data/c4_cleaned2 glob *json.gz\n", + "Number of files 1224 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", + "Number of files 1424 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n" + ] + } + ], + "source": [ + "import glob\n", + "import random\n", + "SEED = 12345\n", + "data_files = []\n", + "def add_jsonlines_dir(path, filespec):\n", + " global data_files\n", + " data_files += glob.glob(f\"{path}/{filespec}\")\n", + " data_files = list(set(data_files))\n", + " print(f\"Number of files {len(data_files)} after adding {path} glob {filespec}\")\n", + "add_jsonlines_dir(f\"/data/c4_cleaned2\", \"*json.gz\")\n", + "add_jsonlines_dir(f\"/data/nrc_uniq_cleaned_20210223\", \"*.gz\")\n", + "add_jsonlines_dir(f\"/data/nu_uniq_cleaned_20210225\", \"*.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "5b1c04f8", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:datasets.builder:Using custom data configuration default-5aa1f90e962b1369\n", + "WARNING:datasets.builder:Reusing dataset json (/home/dat/.cache/huggingface/datasets/json/default-5aa1f90e962b1369/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723)\n" + ] + } + ], + "source": [ + "datasets = load_dataset('json', data_files={'train': train, 'validation': val})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "47db602d", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_iterator = iter(datasets['train'])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "03a23a9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'text': 'Welke school kiezen?\\nSchoolinformatie in je brievenbus ... graag of liever niet?\\nKinderen die geboren zijn tussen 15 november en 31 december 2016 stappen pas in op 1 september 2019.\\nVoor deze groep is inschrijving mogelijk tijdens de eerste voorrangsperiode: van 1 maart tot 16 maart 2018.\\n2: kinderen van personeel van de school.\\nVoor deze groep is inschrijving mogelijk tijdens de tweede voorrangsperiode: op 22 maart en 23 maart 2018.\\nVanaf 18 april tot 4mei 2018 loopt de algemene inschrijvingsperiode en dan kunnen alle kinderen ingeschreven worden.\\nTIP: Wacht niet tot de laatste dag om je kind in te schrijven. Hoe sneller je inschrijft, hoe groter de kans dat er plaats is in de school die je kiest.\\nKies bij voorkeur een school in de buurt van waar je woont; dat heeft enkel maar voordelen. Bezoek zeker vooraf de school van je keuze. Je kan er uitleg vragen en de schoolomgeving ontdekken.\\nIn welke school zijn er vrije plaatsen?\\nVanaf 30 maart 2018 vanaf 17u vind je informatie over de vrije plaatsen op www.lop.be Zo weet je hoeveel plaatsen in jouw school beschikbaar zijn.\\n• Wil je bepaalde scholen bezoeken samen met andere ouders uit je buurt? Neem contact op met School in zicht.\\nMeer informatie over inschrijven in een school vind je ook op de website van Onderwijs Vlaanderen.\\nBlijft het onduidelijk?\\nAlle scholen van Bonheiden, Mechelen, Sint-Katelijne-Waver en Zemst schrijven de kinderen in op hetzelfde moment.\\nIs je kind een jongere broer of zus die in 2016 geboren is? Maak dan een afspraak met de school.\\nIs je kind een oudere broer of zus die voor 2016 geboren is? Ga naar de school tijdens de voorrangsperiode. De inschrijvingen starten op 1 maart om 8u30 stipt en eindigen op 16 maart om 16u. Let op, het aantal plaatsen is in elke school beperkt en wie eerst komt wordt eerst ingeschreven.\\nLet op: vanaf 17 maart heeft de jongere broer/zus niet langer voorrang. Je kan hem/haar dan alleen tijdens de algemene inschrijvingsperiode inschrijven.\\nDe inschrijvingen starten op 18 april 2018 om 8u30 stipt en eindigen op 4 mei2018 om 16u. Let op, het aantal plaatsen is in elke school beperkt en wie eerst komt wordt eerst ingeschreven.\\nNeem de identiteitskaart van je kind of een ander identiteitsbewijs mee (bv. Kids-ID, paspoort, identiteitsbewijs voor kinderen onder 12 jaar).\\nSTAP 1; Je hebt een gesprek met de directeur. Je krijgt informatie over het schoolreglement en het pedagogisch project van de school. Alleen als je akkoord gaat met het schoolreglement en het pedagogisch project kan je je kind inschrijven.\\nHeeft de moeder van het kind een diploma van hoger secundair onderwijs?\\nHeeft het gezin een schooltoelage voor het schooljaar 2017-2018 en/of 2018-2019?\\nAan de hand van de antwoorden wordt je kind ingedeeld als een indicatorleerling of een niet-indicatorleerling.\\nSTAP 3; Op basis van het aantal beschikbare plaatsen wordt je kind wel of niet ingeschreven. Er zijn 3 mogelijkheden.\\nJe kind wordt onmiddellijk ingeschreven in het inschrijvingsregister van de school.\\nEr is nog wel plaats in de school maar niet meer in de groep waarvoor je kind in aanmerking komt.\\n- De directeur kan je niet onmiddellijk zeggen of er uiteindelijk plaats zal zijn of niet.\\n- Je kind komt op de wachtlijst van de school. De wachtlijst geeft geen garantie op een plaats. Als een ander ingeschreven kind de school verlaat, kan de eerste op de wachtlijst zich inschrijven.\\n- Wat moet je doen? Zoek zo snel mogelijk een andere school voor je kind.\\nMechelen heeft een uitgebreid onderwijsaanbod. Op www.mechelen.be/scholen vind je een overzicht.\\nHet stadsbestuur vindt het erg belangrijk dat ouders voldoende geïnformeerd worden over het scholenaanbod in Mechelen en de inschrijvingsprocedure in Mechelse scholen. Daarom krijgen de scholen en een aantal organisaties de mogelijkheid om aan inwoners van Mechelen met schoolgaande kinderen via de post informatie te bezorgen over het basis- of secundair onderwijs en de inschrijvingsprocedure in Mechelse scholen..\\nDeze informatie kan in functie zijn van de leeftijdsgroepen 2,5 jarigen (kleuters), 6 jarigen (lagere school) en de 12 jarigen (secundaire school).\\nGrote Markt 21, 2800 Mechelen.', 'timestamp': datetime.datetime(2018, 10, 23, 5, 45), 'url': 'https://www.mechelen.be/schrijf-je-kind-tijdig-in-op-school', 'id': 42}\n" + ] + } + ], + "source": [ + "print(next(dataset_iterator))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fc9519d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files 0 after adding /data/oscar_nl_cleaned2\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'data_dir' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipykernel_371965/3501862563.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;31m#add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\",\"*.gz\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0madd_jsonlines_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{data_dir}/c4_cleaned\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"*73*.gz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'data_dir' is not defined" ] } ], @@ -200,9 +1469,13 @@ " print(f\"Number of files {len(data_files)} after adding {path}\")\n", " \n", "#add_jsonlines_dir(\"/home/dat/subset_c4_cleannl\",\"*.gz\") \n", - "add_jsonlines_dir(\"/data/oscar_nl_cleaned\",\"*.gz\")\n", + "add_jsonlines_dir(\"/data/oscar_nl_cleaned2\",\"*.gz\")\n", "#add_jsonlines_dir(\"/data/nrc_cleaned_idtextfmt\",\"*.gz\")\n", "#add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\",\"*.gz\")\n", + "\n", + "add_jsonlines_dir(f\"{data_dir}/c4_cleaned\", \"*73*.gz\")\n", + "\n", + " \n", "random.Random(SEED).shuffle(data_files)\n", "total = len(data_files)\n", "val_size = int(0.05 * total)\n", @@ -218,6 +1491,19 @@ "assert list(set(train) & set(val)) == [], 'train overlaps with test'\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "71cac0b7", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import Dataset\n", + "datasets['train'] = Dataset.from_file(\"/home/dat/.cache/huggingface/datasets/json/default-3eb349358dcf6436/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-train.arrow\") \n", + "datasets['validation'] = Dataset.from_file(\"/home/dat/.cache/huggingface/datasets/json/default-3eb349358dcf6436/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-validation.arrow\") \n", + "\n" + ] + }, { "cell_type": "code", "execution_count": 41,