hynky HF staff committed on
Commit
77ba698
1 Parent(s): 75aa635

add notebooks

Browse files
notebooks/ablation_fw_edu.ipynb ADDED
@@ -0,0 +1,2015 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Fetch the data from the hub"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 35,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import os\n",
17
+ "import itertools\n",
18
+ "import pandas as pd\n",
19
+ "from concurrent.futures import ThreadPoolExecutor\n",
20
+ "from tqdm import tqdm\n",
21
+ "import itertools\n",
22
+ "import huggingface_hub\n",
23
+ "from tensorboard.backend.event_processing.event_accumulator import EventAccumulator\n",
24
+ "from huggingface_hub.utils import EntryNotFoundError"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 36,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "def step_element_match(step_to_check, step_element):\n",
34
+ " step_element = step_element.strip().replace(\" \", \"\")\n",
35
+ " if \"-\" in step_element:\n",
36
+ " a, b = step_element.split(\"-\")\n",
37
+ " c = None\n",
38
+ " if \"%\" in b:\n",
39
+ " b, c = b.split(\"%\")\n",
40
+ " return (int(a) <= step_to_check <= int(b) and\n",
41
+ " (c is None or (step_to_check - int(a)) % int(c) == 0))\n",
42
+ " elif \"%\" in step_element:\n",
43
+ " return step_to_check % int(step_element[1:]) == 0\n",
44
+ " else:\n",
45
+ " return step_to_check == int(step_element)\n",
46
+ " \n",
47
+ "def fetch_run_results_simple(repo_name, runs_to_fetch, steps_to_fetch, prefix, agg_score_columns, column_name,\n",
48
+ " seed_merge_method, oauth_token=None, prefix_file=None):\n",
49
+ " if not runs_to_fetch:\n",
50
+ " return\n",
51
+ "\n",
52
+ " def fetch_run_files(run_to_fetch):\n",
53
+ " def filename_to_steps_timestamp(fn):\n",
54
+ " step, ts = fn.split(\"_events.out.tfevents.\")\n",
55
+ " return int(step[-7:]), int(ts[:ts.index(\".\")])\n",
56
+ "\n",
57
+ " run_to_fetch += \"_e\"\n",
58
+ " try:\n",
59
+ " eval_repo_file_names = [f.path for f in\n",
60
+ " huggingface_hub.list_repo_tree(repo_name, run_to_fetch, expand=False,\n",
61
+ " token=oauth_token) if\n",
62
+ " \"_events.out.tfevents\" in f.path]\n",
63
+ " except EntryNotFoundError:\n",
64
+ " return []\n",
65
+ "\n",
66
+ " eval_files = [os.path.relpath(f, run_to_fetch) for f in eval_repo_file_names]\n",
67
+ " timestamps = {}\n",
68
+ " for fn in eval_files:\n",
69
+ " steps, ts = filename_to_steps_timestamp(fn)\n",
70
+ " if steps not in timestamps or timestamps[steps][0] < ts:\n",
71
+ " timestamps[steps] = ts, fn\n",
72
+ "\n",
73
+ " results = []\n",
74
+ " for eval_file, repofile in zip(eval_files, eval_repo_file_names):\n",
75
+ " steps, ts = filename_to_steps_timestamp(eval_file)\n",
76
+ " if not any(step_element_match(steps, step_el) for step_el in steps_to_fetch.split(\",\")):\n",
77
+ " continue\n",
78
+ " if timestamps[steps][1] == eval_file:\n",
79
+ " results.append((run_to_fetch, steps, repofile))\n",
80
+ " return results\n",
81
+ "\n",
82
+ " def load_run_file(data):\n",
83
+ " run_to_fetch, steps, repofile = data\n",
84
+ " loader = EventAccumulator(huggingface_hub.hf_hub_download(repo_name, repofile, token=oauth_token))\n",
85
+ " loader.Reload()\n",
86
+ " runname = run_to_fetch.removeprefix(prefix).removesuffix(\"-_e\")\n",
87
+ " column_names = [\"runname\", \"seed\", \"steps\", \"agg_score\"]\n",
88
+ " column_values = [runname, 0, steps, 0.0]\n",
89
+ "\n",
90
+ " for tag in loader.Tags()['scalars']:\n",
91
+ " if not \"stderr\" in tag and tag.split('/')[0] == 'e':\n",
92
+ " event_list = loader.Scalars(tag)\n",
93
+ " tag = tag.split('/')\n",
94
+ " column_names.append(f\"{tag[1]}/{tag[2]}\")\n",
95
+ " column_values.append(event_list[-1].value)\n",
96
+ "\n",
97
+ " return pd.DataFrame([column_values], columns=column_names)\n",
98
+ "\n",
99
+ " with ThreadPoolExecutor() as pool:\n",
100
+ " run_files = list(itertools.chain.from_iterable(\n",
101
+ " tqdm(pool.map(fetch_run_files, runs_to_fetch), total=len(runs_to_fetch), desc=\"Fetching datafiles...\")))\n",
102
+ " df = pd.concat(tqdm(pool.map(load_run_file, run_files), total=len(run_files), desc=\"Loading evals data...\"))\n",
103
+ "\n",
104
+ " cols_to_avg = [col for col in agg_score_columns if col in df.columns]\n",
105
+ " if cols_to_avg:\n",
106
+ " df['agg_score'] = df[cols_to_avg].mean(axis=1)\n",
107
+ "\n",
108
+ " prefix_file = prefix_file + \"_\" if prefix_file else \"\"\n",
109
+ " df.to_csv(f\"{prefix_file}{repo_name.split('/')[-1]}_metrics.csv\", index=False)\n",
110
+ " print(f\"Metrics saved to {repo_name.split('/')[-1]}_metrics.csv\")\n",
111
+ "\n",
112
+ " return df"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 37,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stderr",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "Fetching datafiles...: 100%|██████████| 1/1 [00:00<00:00, 1.77it/s]\n",
125
+ "Loading evals data...: 100%|██████████| 82/82 [00:13<00:00, 5.90it/s]"
126
+ ]
127
+ },
128
+ {
129
+ "name": "stdout",
130
+ "output_type": "stream",
131
+ "text": [
132
+ "Metrics saved to loubna-edu_fw_ablations_metrics.csv\n"
133
+ ]
134
+ },
135
+ {
136
+ "name": "stderr",
137
+ "output_type": "stream",
138
+ "text": [
139
+ "\n"
140
+ ]
141
+ },
142
+ {
143
+ "data": {
144
+ "text/html": [
145
+ "<div>\n",
146
+ "<style scoped>\n",
147
+ " .dataframe tbody tr th:only-of-type {\n",
148
+ " vertical-align: middle;\n",
149
+ " }\n",
150
+ "\n",
151
+ " .dataframe tbody tr th {\n",
152
+ " vertical-align: top;\n",
153
+ " }\n",
154
+ "\n",
155
+ " .dataframe thead th {\n",
156
+ " text-align: right;\n",
157
+ " }\n",
158
+ "</style>\n",
159
+ "<table border=\"1\" class=\"dataframe\">\n",
160
+ " <thead>\n",
161
+ " <tr style=\"text-align: right;\">\n",
162
+ " <th></th>\n",
163
+ " <th>runname</th>\n",
164
+ " <th>seed</th>\n",
165
+ " <th>steps</th>\n",
166
+ " <th>agg_score</th>\n",
167
+ " <th>commonsense_qa/acc</th>\n",
168
+ " <th>commonsense_qa/acc_norm</th>\n",
169
+ " <th>hellaswag/acc</th>\n",
170
+ " <th>hellaswag/acc_norm</th>\n",
171
+ " <th>openbookqa/acc</th>\n",
172
+ " <th>openbookqa/acc_norm</th>\n",
173
+ " <th>...</th>\n",
174
+ " <th>siqa/acc</th>\n",
175
+ " <th>siqa/acc_norm</th>\n",
176
+ " <th>winogrande/acc</th>\n",
177
+ " <th>winogrande/acc_norm</th>\n",
178
+ " <th>all/acc</th>\n",
179
+ " <th>all/acc_norm</th>\n",
180
+ " <th>arc/acc</th>\n",
181
+ " <th>arc/acc_norm</th>\n",
182
+ " <th>mmlu/acc</th>\n",
183
+ " <th>mmlu/acc_norm</th>\n",
184
+ " </tr>\n",
185
+ " </thead>\n",
186
+ " <tbody>\n",
187
+ " <tr>\n",
188
+ " <th>0</th>\n",
189
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
190
+ " <td>0</td>\n",
191
+ " <td>2000</td>\n",
192
+ " <td>0.390326</td>\n",
193
+ " <td>0.284</td>\n",
194
+ " <td>0.283</td>\n",
195
+ " <td>0.314</td>\n",
196
+ " <td>0.325</td>\n",
197
+ " <td>0.164</td>\n",
198
+ " <td>0.296</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>0.362</td>\n",
201
+ " <td>0.406</td>\n",
202
+ " <td>0.511</td>\n",
203
+ " <td>0.511</td>\n",
204
+ " <td>0.279674</td>\n",
205
+ " <td>0.299162</td>\n",
206
+ " <td>0.3795</td>\n",
207
+ " <td>0.3850</td>\n",
208
+ " <td>0.265997</td>\n",
209
+ " <td>0.284605</td>\n",
210
+ " </tr>\n",
211
+ " <tr>\n",
212
+ " <th>0</th>\n",
213
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
214
+ " <td>0</td>\n",
215
+ " <td>4000</td>\n",
216
+ " <td>0.414680</td>\n",
217
+ " <td>0.322</td>\n",
218
+ " <td>0.307</td>\n",
219
+ " <td>0.343</td>\n",
220
+ " <td>0.395</td>\n",
221
+ " <td>0.196</td>\n",
222
+ " <td>0.320</td>\n",
223
+ " <td>...</td>\n",
224
+ " <td>0.371</td>\n",
225
+ " <td>0.388</td>\n",
226
+ " <td>0.518</td>\n",
227
+ " <td>0.495</td>\n",
228
+ " <td>0.290613</td>\n",
229
+ " <td>0.312593</td>\n",
230
+ " <td>0.4215</td>\n",
231
+ " <td>0.4285</td>\n",
232
+ " <td>0.274401</td>\n",
233
+ " <td>0.295939</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>0</th>\n",
237
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
238
+ " <td>0</td>\n",
239
+ " <td>6000</td>\n",
240
+ " <td>0.428390</td>\n",
241
+ " <td>0.319</td>\n",
242
+ " <td>0.311</td>\n",
243
+ " <td>0.372</td>\n",
244
+ " <td>0.431</td>\n",
245
+ " <td>0.202</td>\n",
246
+ " <td>0.352</td>\n",
247
+ " <td>...</td>\n",
248
+ " <td>0.373</td>\n",
249
+ " <td>0.392</td>\n",
250
+ " <td>0.520</td>\n",
251
+ " <td>0.519</td>\n",
252
+ " <td>0.303980</td>\n",
253
+ " <td>0.323323</td>\n",
254
+ " <td>0.4315</td>\n",
255
+ " <td>0.4460</td>\n",
256
+ " <td>0.288591</td>\n",
257
+ " <td>0.306123</td>\n",
258
+ " </tr>\n",
259
+ " <tr>\n",
260
+ " <th>0</th>\n",
261
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
262
+ " <td>0</td>\n",
263
+ " <td>8000</td>\n",
264
+ " <td>0.443615</td>\n",
265
+ " <td>0.340</td>\n",
266
+ " <td>0.311</td>\n",
267
+ " <td>0.379</td>\n",
268
+ " <td>0.463</td>\n",
269
+ " <td>0.204</td>\n",
270
+ " <td>0.360</td>\n",
271
+ " <td>...</td>\n",
272
+ " <td>0.384</td>\n",
273
+ " <td>0.404</td>\n",
274
+ " <td>0.517</td>\n",
275
+ " <td>0.517</td>\n",
276
+ " <td>0.315148</td>\n",
277
+ " <td>0.333284</td>\n",
278
+ " <td>0.4630</td>\n",
279
+ " <td>0.4790</td>\n",
280
+ " <td>0.299186</td>\n",
281
+ " <td>0.314921</td>\n",
282
+ " </tr>\n",
283
+ " <tr>\n",
284
+ " <th>0</th>\n",
285
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
286
+ " <td>0</td>\n",
287
+ " <td>10000</td>\n",
288
+ " <td>0.441457</td>\n",
289
+ " <td>0.346</td>\n",
290
+ " <td>0.317</td>\n",
291
+ " <td>0.390</td>\n",
292
+ " <td>0.454</td>\n",
293
+ " <td>0.222</td>\n",
294
+ " <td>0.364</td>\n",
295
+ " <td>...</td>\n",
296
+ " <td>0.366</td>\n",
297
+ " <td>0.395</td>\n",
298
+ " <td>0.514</td>\n",
299
+ " <td>0.506</td>\n",
300
+ " <td>0.318935</td>\n",
301
+ " <td>0.335419</td>\n",
302
+ " <td>0.4890</td>\n",
303
+ " <td>0.4820</td>\n",
304
+ " <td>0.302189</td>\n",
305
+ " <td>0.317653</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>...</th>\n",
309
+ " <td>...</td>\n",
310
+ " <td>...</td>\n",
311
+ " <td>...</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>...</td>\n",
314
+ " <td>...</td>\n",
315
+ " <td>...</td>\n",
316
+ " <td>...</td>\n",
317
+ " <td>...</td>\n",
318
+ " <td>...</td>\n",
319
+ " <td>...</td>\n",
320
+ " <td>...</td>\n",
321
+ " <td>...</td>\n",
322
+ " <td>...</td>\n",
323
+ " <td>...</td>\n",
324
+ " <td>...</td>\n",
325
+ " <td>...</td>\n",
326
+ " <td>...</td>\n",
327
+ " <td>...</td>\n",
328
+ " <td>...</td>\n",
329
+ " <td>...</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>0</th>\n",
333
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
334
+ " <td>0</td>\n",
335
+ " <td>160000</td>\n",
336
+ " <td>0.507129</td>\n",
337
+ " <td>0.430</td>\n",
338
+ " <td>0.359</td>\n",
339
+ " <td>0.473</td>\n",
340
+ " <td>0.593</td>\n",
341
+ " <td>0.282</td>\n",
342
+ " <td>0.418</td>\n",
343
+ " <td>...</td>\n",
344
+ " <td>0.392</td>\n",
345
+ " <td>0.402</td>\n",
346
+ " <td>0.576</td>\n",
347
+ " <td>0.575</td>\n",
348
+ " <td>0.369137</td>\n",
349
+ " <td>0.393898</td>\n",
350
+ " <td>0.5670</td>\n",
351
+ " <td>0.5725</td>\n",
352
+ " <td>0.350226</td>\n",
353
+ " <td>0.374533</td>\n",
354
+ " </tr>\n",
355
+ " <tr>\n",
356
+ " <th>0</th>\n",
357
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
358
+ " <td>0</td>\n",
359
+ " <td>162000</td>\n",
360
+ " <td>0.509118</td>\n",
361
+ " <td>0.416</td>\n",
362
+ " <td>0.367</td>\n",
363
+ " <td>0.474</td>\n",
364
+ " <td>0.592</td>\n",
365
+ " <td>0.288</td>\n",
366
+ " <td>0.408</td>\n",
367
+ " <td>...</td>\n",
368
+ " <td>0.390</td>\n",
369
+ " <td>0.409</td>\n",
370
+ " <td>0.572</td>\n",
371
+ " <td>0.577</td>\n",
372
+ " <td>0.367420</td>\n",
373
+ " <td>0.392861</td>\n",
374
+ " <td>0.5720</td>\n",
375
+ " <td>0.5780</td>\n",
376
+ " <td>0.348268</td>\n",
377
+ " <td>0.372947</td>\n",
378
+ " </tr>\n",
379
+ " <tr>\n",
380
+ " <th>0</th>\n",
381
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
382
+ " <td>0</td>\n",
383
+ " <td>164000</td>\n",
384
+ " <td>0.507843</td>\n",
385
+ " <td>0.416</td>\n",
386
+ " <td>0.365</td>\n",
387
+ " <td>0.467</td>\n",
388
+ " <td>0.591</td>\n",
389
+ " <td>0.276</td>\n",
390
+ " <td>0.408</td>\n",
391
+ " <td>...</td>\n",
392
+ " <td>0.395</td>\n",
393
+ " <td>0.406</td>\n",
394
+ " <td>0.576</td>\n",
395
+ " <td>0.580</td>\n",
396
+ " <td>0.368319</td>\n",
397
+ " <td>0.392000</td>\n",
398
+ " <td>0.5635</td>\n",
399
+ " <td>0.5715</td>\n",
400
+ " <td>0.349943</td>\n",
401
+ " <td>0.372246</td>\n",
402
+ " </tr>\n",
403
+ " <tr>\n",
404
+ " <th>0</th>\n",
405
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
406
+ " <td>0</td>\n",
407
+ " <td>166000</td>\n",
408
+ " <td>0.508308</td>\n",
409
+ " <td>0.415</td>\n",
410
+ " <td>0.364</td>\n",
411
+ " <td>0.472</td>\n",
412
+ " <td>0.593</td>\n",
413
+ " <td>0.282</td>\n",
414
+ " <td>0.414</td>\n",
415
+ " <td>...</td>\n",
416
+ " <td>0.401</td>\n",
417
+ " <td>0.408</td>\n",
418
+ " <td>0.575</td>\n",
419
+ " <td>0.570</td>\n",
420
+ " <td>0.370593</td>\n",
421
+ " <td>0.393176</td>\n",
422
+ " <td>0.5640</td>\n",
423
+ " <td>0.5760</td>\n",
424
+ " <td>0.352203</td>\n",
425
+ " <td>0.373463</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>0</th>\n",
429
+ " <td>edu_fineweb_350b_tokens-seed-1</td>\n",
430
+ " <td>0</td>\n",
431
+ " <td>167000</td>\n",
432
+ " <td>0.509494</td>\n",
433
+ " <td>0.429</td>\n",
434
+ " <td>0.362</td>\n",
435
+ " <td>0.472</td>\n",
436
+ " <td>0.597</td>\n",
437
+ " <td>0.290</td>\n",
438
+ " <td>0.418</td>\n",
439
+ " <td>...</td>\n",
440
+ " <td>0.395</td>\n",
441
+ " <td>0.404</td>\n",
442
+ " <td>0.582</td>\n",
443
+ " <td>0.578</td>\n",
444
+ " <td>0.369666</td>\n",
445
+ " <td>0.394136</td>\n",
446
+ " <td>0.5670</td>\n",
447
+ " <td>0.5735</td>\n",
448
+ " <td>0.350671</td>\n",
449
+ " <td>0.374453</td>\n",
450
+ " </tr>\n",
451
+ " </tbody>\n",
452
+ "</table>\n",
453
+ "<p>82 rows × 22 columns</p>\n",
454
+ "</div>"
455
+ ],
456
+ "text/plain": [
457
+ " runname seed steps agg_score \\\n",
458
+ "0 edu_fineweb_350b_tokens-seed-1 0 2000 0.390326 \n",
459
+ "0 edu_fineweb_350b_tokens-seed-1 0 4000 0.414680 \n",
460
+ "0 edu_fineweb_350b_tokens-seed-1 0 6000 0.428390 \n",
461
+ "0 edu_fineweb_350b_tokens-seed-1 0 8000 0.443615 \n",
462
+ "0 edu_fineweb_350b_tokens-seed-1 0 10000 0.441457 \n",
463
+ ".. ... ... ... ... \n",
464
+ "0 edu_fineweb_350b_tokens-seed-1 0 160000 0.507129 \n",
465
+ "0 edu_fineweb_350b_tokens-seed-1 0 162000 0.509118 \n",
466
+ "0 edu_fineweb_350b_tokens-seed-1 0 164000 0.507843 \n",
467
+ "0 edu_fineweb_350b_tokens-seed-1 0 166000 0.508308 \n",
468
+ "0 edu_fineweb_350b_tokens-seed-1 0 167000 0.509494 \n",
469
+ "\n",
470
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
471
+ "0 0.284 0.283 0.314 \n",
472
+ "0 0.322 0.307 0.343 \n",
473
+ "0 0.319 0.311 0.372 \n",
474
+ "0 0.340 0.311 0.379 \n",
475
+ "0 0.346 0.317 0.390 \n",
476
+ ".. ... ... ... \n",
477
+ "0 0.430 0.359 0.473 \n",
478
+ "0 0.416 0.367 0.474 \n",
479
+ "0 0.416 0.365 0.467 \n",
480
+ "0 0.415 0.364 0.472 \n",
481
+ "0 0.429 0.362 0.472 \n",
482
+ "\n",
483
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
484
+ "0 0.325 0.164 0.296 ... 0.362 \n",
485
+ "0 0.395 0.196 0.320 ... 0.371 \n",
486
+ "0 0.431 0.202 0.352 ... 0.373 \n",
487
+ "0 0.463 0.204 0.360 ... 0.384 \n",
488
+ "0 0.454 0.222 0.364 ... 0.366 \n",
489
+ ".. ... ... ... ... ... \n",
490
+ "0 0.593 0.282 0.418 ... 0.392 \n",
491
+ "0 0.592 0.288 0.408 ... 0.390 \n",
492
+ "0 0.591 0.276 0.408 ... 0.395 \n",
493
+ "0 0.593 0.282 0.414 ... 0.401 \n",
494
+ "0 0.597 0.290 0.418 ... 0.395 \n",
495
+ "\n",
496
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm all/acc \\\n",
497
+ "0 0.406 0.511 0.511 0.279674 \n",
498
+ "0 0.388 0.518 0.495 0.290613 \n",
499
+ "0 0.392 0.520 0.519 0.303980 \n",
500
+ "0 0.404 0.517 0.517 0.315148 \n",
501
+ "0 0.395 0.514 0.506 0.318935 \n",
502
+ ".. ... ... ... ... \n",
503
+ "0 0.402 0.576 0.575 0.369137 \n",
504
+ "0 0.409 0.572 0.577 0.367420 \n",
505
+ "0 0.406 0.576 0.580 0.368319 \n",
506
+ "0 0.408 0.575 0.570 0.370593 \n",
507
+ "0 0.404 0.582 0.578 0.369666 \n",
508
+ "\n",
509
+ " all/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
510
+ "0 0.299162 0.3795 0.3850 0.265997 0.284605 \n",
511
+ "0 0.312593 0.4215 0.4285 0.274401 0.295939 \n",
512
+ "0 0.323323 0.4315 0.4460 0.288591 0.306123 \n",
513
+ "0 0.333284 0.4630 0.4790 0.299186 0.314921 \n",
514
+ "0 0.335419 0.4890 0.4820 0.302189 0.317653 \n",
515
+ ".. ... ... ... ... ... \n",
516
+ "0 0.393898 0.5670 0.5725 0.350226 0.374533 \n",
517
+ "0 0.392861 0.5720 0.5780 0.348268 0.372947 \n",
518
+ "0 0.392000 0.5635 0.5715 0.349943 0.372246 \n",
519
+ "0 0.393176 0.5640 0.5760 0.352203 0.373463 \n",
520
+ "0 0.394136 0.5670 0.5735 0.350671 0.374453 \n",
521
+ "\n",
522
+ "[82 rows x 22 columns]"
523
+ ]
524
+ },
525
+ "execution_count": 37,
526
+ "metadata": {},
527
+ "output_type": "execute_result"
528
+ }
529
+ ],
530
+ "source": [
531
+ "token = os.getenv(\"HF_TOKEN\")\n",
532
+ "repo_name = \"HuggingFaceTB/loubna-edu_fw_ablations\"\n",
533
+ "runs_to_fetch = [\"tb/edu_fw_ablations-1p82G-edu_fineweb_350b_tokens-seed-1-\"]\n",
534
+ "steps_to_fetch = \"%1000\"\n",
535
+ "prefix = \"tb/edu_fw_ablations-1p82G-\"\n",
536
+ "metrics = ['commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
537
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
538
+ "agg_score_columns = metrics\n",
539
+ "column_name = \"agg_score\"\n",
540
+ "seed_merge_method = \"mean\"\n",
541
+ "oauth_token = token\n",
542
+ "\n",
543
+ "# runs_to_fetch = [prefix + run for run in runs_to_fetch]\n",
544
+ "fetch_run_results_simple(repo_name, runs_to_fetch, steps_to_fetch, prefix, agg_score_columns, column_name, seed_merge_method, oauth_token=token)"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "markdown",
549
+ "metadata": {},
550
+ "source": [
551
+ "## Plot the data"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "markdown",
556
+ "metadata": {},
557
+ "source": [
558
+ "### Load csvs for FW and FW-Edu"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 38,
564
+ "metadata": {},
565
+ "outputs": [
566
+ {
567
+ "data": {
568
+ "text/html": [
569
+ "<div>\n",
570
+ "<style scoped>\n",
571
+ " .dataframe tbody tr th:only-of-type {\n",
572
+ " vertical-align: middle;\n",
573
+ " }\n",
574
+ "\n",
575
+ " .dataframe tbody tr th {\n",
576
+ " vertical-align: top;\n",
577
+ " }\n",
578
+ "\n",
579
+ " .dataframe thead th {\n",
580
+ " text-align: right;\n",
581
+ " }\n",
582
+ "</style>\n",
583
+ "<table border=\"1\" class=\"dataframe\">\n",
584
+ " <thead>\n",
585
+ " <tr style=\"text-align: right;\">\n",
586
+ " <th></th>\n",
587
+ " <th>runname</th>\n",
588
+ " <th>steps</th>\n",
589
+ " <th>agg_score</th>\n",
590
+ " <th>commonsense_qa/acc</th>\n",
591
+ " <th>commonsense_qa/acc_norm</th>\n",
592
+ " <th>hellaswag/acc</th>\n",
593
+ " <th>hellaswag/acc_norm</th>\n",
594
+ " <th>openbookqa/acc</th>\n",
595
+ " <th>openbookqa/acc_norm</th>\n",
596
+ " <th>piqa/acc</th>\n",
597
+ " <th>piqa/acc_norm</th>\n",
598
+ " <th>siqa/acc</th>\n",
599
+ " <th>siqa/acc_norm</th>\n",
600
+ " <th>winogrande/acc</th>\n",
601
+ " <th>winogrande/acc_norm</th>\n",
602
+ " <th>arc/acc</th>\n",
603
+ " <th>arc/acc_norm</th>\n",
604
+ " <th>mmlu/acc</th>\n",
605
+ " <th>mmlu/acc_norm</th>\n",
606
+ " </tr>\n",
607
+ " </thead>\n",
608
+ " <tbody>\n",
609
+ " <tr>\n",
610
+ " <th>0</th>\n",
611
+ " <td>FineWeb-Edu</td>\n",
612
+ " <td>2000</td>\n",
613
+ " <td>0.390326</td>\n",
614
+ " <td>0.284</td>\n",
615
+ " <td>0.283</td>\n",
616
+ " <td>0.314</td>\n",
617
+ " <td>0.325</td>\n",
618
+ " <td>0.164</td>\n",
619
+ " <td>0.296</td>\n",
620
+ " <td>0.623</td>\n",
621
+ " <td>0.632</td>\n",
622
+ " <td>0.362</td>\n",
623
+ " <td>0.406</td>\n",
624
+ " <td>0.511</td>\n",
625
+ " <td>0.511</td>\n",
626
+ " <td>0.3795</td>\n",
627
+ " <td>0.3850</td>\n",
628
+ " <td>0.265997</td>\n",
629
+ " <td>0.284605</td>\n",
630
+ " </tr>\n",
631
+ " <tr>\n",
632
+ " <th>1</th>\n",
633
+ " <td>FineWeb-Edu</td>\n",
634
+ " <td>4000</td>\n",
635
+ " <td>0.414680</td>\n",
636
+ " <td>0.322</td>\n",
637
+ " <td>0.307</td>\n",
638
+ " <td>0.343</td>\n",
639
+ " <td>0.395</td>\n",
640
+ " <td>0.196</td>\n",
641
+ " <td>0.320</td>\n",
642
+ " <td>0.656</td>\n",
643
+ " <td>0.688</td>\n",
644
+ " <td>0.371</td>\n",
645
+ " <td>0.388</td>\n",
646
+ " <td>0.518</td>\n",
647
+ " <td>0.495</td>\n",
648
+ " <td>0.4215</td>\n",
649
+ " <td>0.4285</td>\n",
650
+ " <td>0.274401</td>\n",
651
+ " <td>0.295939</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <th>2</th>\n",
655
+ " <td>FineWeb-Edu</td>\n",
656
+ " <td>6000</td>\n",
657
+ " <td>0.428390</td>\n",
658
+ " <td>0.319</td>\n",
659
+ " <td>0.311</td>\n",
660
+ " <td>0.372</td>\n",
661
+ " <td>0.431</td>\n",
662
+ " <td>0.202</td>\n",
663
+ " <td>0.352</td>\n",
664
+ " <td>0.660</td>\n",
665
+ " <td>0.670</td>\n",
666
+ " <td>0.373</td>\n",
667
+ " <td>0.392</td>\n",
668
+ " <td>0.520</td>\n",
669
+ " <td>0.519</td>\n",
670
+ " <td>0.4315</td>\n",
671
+ " <td>0.4460</td>\n",
672
+ " <td>0.288591</td>\n",
673
+ " <td>0.306123</td>\n",
674
+ " </tr>\n",
675
+ " <tr>\n",
676
+ " <th>3</th>\n",
677
+ " <td>FineWeb-Edu</td>\n",
678
+ " <td>8000</td>\n",
679
+ " <td>0.443615</td>\n",
680
+ " <td>0.340</td>\n",
681
+ " <td>0.311</td>\n",
682
+ " <td>0.379</td>\n",
683
+ " <td>0.463</td>\n",
684
+ " <td>0.204</td>\n",
685
+ " <td>0.360</td>\n",
686
+ " <td>0.681</td>\n",
687
+ " <td>0.700</td>\n",
688
+ " <td>0.384</td>\n",
689
+ " <td>0.404</td>\n",
690
+ " <td>0.517</td>\n",
691
+ " <td>0.517</td>\n",
692
+ " <td>0.4630</td>\n",
693
+ " <td>0.4790</td>\n",
694
+ " <td>0.299186</td>\n",
695
+ " <td>0.314921</td>\n",
696
+ " </tr>\n",
697
+ " <tr>\n",
698
+ " <th>4</th>\n",
699
+ " <td>FineWeb-Edu</td>\n",
700
+ " <td>10000</td>\n",
701
+ " <td>0.441457</td>\n",
702
+ " <td>0.346</td>\n",
703
+ " <td>0.317</td>\n",
704
+ " <td>0.390</td>\n",
705
+ " <td>0.454</td>\n",
706
+ " <td>0.222</td>\n",
707
+ " <td>0.364</td>\n",
708
+ " <td>0.690</td>\n",
709
+ " <td>0.696</td>\n",
710
+ " <td>0.366</td>\n",
711
+ " <td>0.395</td>\n",
712
+ " <td>0.514</td>\n",
713
+ " <td>0.506</td>\n",
714
+ " <td>0.4890</td>\n",
715
+ " <td>0.4820</td>\n",
716
+ " <td>0.302189</td>\n",
717
+ " <td>0.317653</td>\n",
718
+ " </tr>\n",
719
+ " </tbody>\n",
720
+ "</table>\n",
721
+ "</div>"
722
+ ],
723
+ "text/plain": [
724
+ " runname steps agg_score commonsense_qa/acc commonsense_qa/acc_norm \\\n",
725
+ "0 FineWeb-Edu 2000 0.390326 0.284 0.283 \n",
726
+ "1 FineWeb-Edu 4000 0.414680 0.322 0.307 \n",
727
+ "2 FineWeb-Edu 6000 0.428390 0.319 0.311 \n",
728
+ "3 FineWeb-Edu 8000 0.443615 0.340 0.311 \n",
729
+ "4 FineWeb-Edu 10000 0.441457 0.346 0.317 \n",
730
+ "\n",
731
+ " hellaswag/acc hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm \\\n",
732
+ "0 0.314 0.325 0.164 0.296 \n",
733
+ "1 0.343 0.395 0.196 0.320 \n",
734
+ "2 0.372 0.431 0.202 0.352 \n",
735
+ "3 0.379 0.463 0.204 0.360 \n",
736
+ "4 0.390 0.454 0.222 0.364 \n",
737
+ "\n",
738
+ " piqa/acc piqa/acc_norm siqa/acc siqa/acc_norm winogrande/acc \\\n",
739
+ "0 0.623 0.632 0.362 0.406 0.511 \n",
740
+ "1 0.656 0.688 0.371 0.388 0.518 \n",
741
+ "2 0.660 0.670 0.373 0.392 0.520 \n",
742
+ "3 0.681 0.700 0.384 0.404 0.517 \n",
743
+ "4 0.690 0.696 0.366 0.395 0.514 \n",
744
+ "\n",
745
+ " winogrande/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
746
+ "0 0.511 0.3795 0.3850 0.265997 0.284605 \n",
747
+ "1 0.495 0.4215 0.4285 0.274401 0.295939 \n",
748
+ "2 0.519 0.4315 0.4460 0.288591 0.306123 \n",
749
+ "3 0.517 0.4630 0.4790 0.299186 0.314921 \n",
750
+ "4 0.506 0.4890 0.4820 0.302189 0.317653 "
751
+ ]
752
+ },
753
+ "execution_count": 38,
754
+ "metadata": {},
755
+ "output_type": "execute_result"
756
+ }
757
+ ],
758
+ "source": [
759
+ "import pandas as pd\n",
760
+ "\n",
761
+ "# load Guilherme's CSV with all the FW runs\n",
762
+ "df = pd.read_csv(\"../src_data/eval_results.csv\")\n",
763
+ "\n",
764
+ "# load FineWeb-Edu csv\n",
765
+ "df_2 = pd.read_csv(\"./loubna-edu_fw_ablations_metrics.csv\")\n",
766
+ "df_2['runname'] = df_2['runname'].replace('edu_fineweb_350b_tokens-seed-1', 'FineWeb-Edu', regex=True)\n",
767
+ "df_2.drop([\"seed\", \"all/acc\", \"all/acc_norm\"], axis=1, inplace=True)\n",
768
+ "df_2.head()"
769
+ ]
770
+ },
771
+ {
772
+ "cell_type": "code",
773
+ "execution_count": 39,
774
+ "metadata": {},
775
+ "outputs": [
776
+ {
777
+ "data": {
778
+ "text/html": [
779
+ "<div>\n",
780
+ "<style scoped>\n",
781
+ " .dataframe tbody tr th:only-of-type {\n",
782
+ " vertical-align: middle;\n",
783
+ " }\n",
784
+ "\n",
785
+ " .dataframe tbody tr th {\n",
786
+ " vertical-align: top;\n",
787
+ " }\n",
788
+ "\n",
789
+ " .dataframe thead th {\n",
790
+ " text-align: right;\n",
791
+ " }\n",
792
+ "</style>\n",
793
+ "<table border=\"1\" class=\"dataframe\">\n",
794
+ " <thead>\n",
795
+ " <tr style=\"text-align: right;\">\n",
796
+ " <th></th>\n",
797
+ " <th>runname</th>\n",
798
+ " <th>steps</th>\n",
799
+ " <th>agg_score</th>\n",
800
+ " <th>commonsense_qa/acc</th>\n",
801
+ " <th>commonsense_qa/acc_norm</th>\n",
802
+ " <th>hellaswag/acc</th>\n",
803
+ " <th>hellaswag/acc_norm</th>\n",
804
+ " <th>openbookqa/acc</th>\n",
805
+ " <th>openbookqa/acc_norm</th>\n",
806
+ " <th>piqa/acc</th>\n",
807
+ " <th>...</th>\n",
808
+ " <th>siqa/acc</th>\n",
809
+ " <th>siqa/acc_norm</th>\n",
810
+ " <th>winogrande/acc</th>\n",
811
+ " <th>winogrande/acc_norm</th>\n",
812
+ " <th>sciq/acc</th>\n",
813
+ " <th>sciq/acc_norm</th>\n",
814
+ " <th>arc/acc</th>\n",
815
+ " <th>arc/acc_norm</th>\n",
816
+ " <th>mmlu/acc</th>\n",
817
+ " <th>mmlu/acc_norm</th>\n",
818
+ " </tr>\n",
819
+ " </thead>\n",
820
+ " <tbody>\n",
821
+ " <tr>\n",
822
+ " <th>1253</th>\n",
823
+ " <td>FineWeb-Edu</td>\n",
824
+ " <td>160000</td>\n",
825
+ " <td>0.507129</td>\n",
826
+ " <td>0.430</td>\n",
827
+ " <td>0.359</td>\n",
828
+ " <td>0.473</td>\n",
829
+ " <td>0.593</td>\n",
830
+ " <td>0.282</td>\n",
831
+ " <td>0.418</td>\n",
832
+ " <td>0.744</td>\n",
833
+ " <td>...</td>\n",
834
+ " <td>0.392</td>\n",
835
+ " <td>0.402</td>\n",
836
+ " <td>0.576</td>\n",
837
+ " <td>0.575</td>\n",
838
+ " <td>NaN</td>\n",
839
+ " <td>NaN</td>\n",
840
+ " <td>0.5670</td>\n",
841
+ " <td>0.5725</td>\n",
842
+ " <td>0.350226</td>\n",
843
+ " <td>0.374533</td>\n",
844
+ " </tr>\n",
845
+ " <tr>\n",
846
+ " <th>1254</th>\n",
847
+ " <td>FineWeb-Edu</td>\n",
848
+ " <td>162000</td>\n",
849
+ " <td>0.509118</td>\n",
850
+ " <td>0.416</td>\n",
851
+ " <td>0.367</td>\n",
852
+ " <td>0.474</td>\n",
853
+ " <td>0.592</td>\n",
854
+ " <td>0.288</td>\n",
855
+ " <td>0.408</td>\n",
856
+ " <td>0.747</td>\n",
857
+ " <td>...</td>\n",
858
+ " <td>0.390</td>\n",
859
+ " <td>0.409</td>\n",
860
+ " <td>0.572</td>\n",
861
+ " <td>0.577</td>\n",
862
+ " <td>NaN</td>\n",
863
+ " <td>NaN</td>\n",
864
+ " <td>0.5720</td>\n",
865
+ " <td>0.5780</td>\n",
866
+ " <td>0.348268</td>\n",
867
+ " <td>0.372947</td>\n",
868
+ " </tr>\n",
869
+ " <tr>\n",
870
+ " <th>1255</th>\n",
871
+ " <td>FineWeb-Edu</td>\n",
872
+ " <td>164000</td>\n",
873
+ " <td>0.507843</td>\n",
874
+ " <td>0.416</td>\n",
875
+ " <td>0.365</td>\n",
876
+ " <td>0.467</td>\n",
877
+ " <td>0.591</td>\n",
878
+ " <td>0.276</td>\n",
879
+ " <td>0.408</td>\n",
880
+ " <td>0.737</td>\n",
881
+ " <td>...</td>\n",
882
+ " <td>0.395</td>\n",
883
+ " <td>0.406</td>\n",
884
+ " <td>0.576</td>\n",
885
+ " <td>0.580</td>\n",
886
+ " <td>NaN</td>\n",
887
+ " <td>NaN</td>\n",
888
+ " <td>0.5635</td>\n",
889
+ " <td>0.5715</td>\n",
890
+ " <td>0.349943</td>\n",
891
+ " <td>0.372246</td>\n",
892
+ " </tr>\n",
893
+ " <tr>\n",
894
+ " <th>1256</th>\n",
895
+ " <td>FineWeb-Edu</td>\n",
896
+ " <td>166000</td>\n",
897
+ " <td>0.508308</td>\n",
898
+ " <td>0.415</td>\n",
899
+ " <td>0.364</td>\n",
900
+ " <td>0.472</td>\n",
901
+ " <td>0.593</td>\n",
902
+ " <td>0.282</td>\n",
903
+ " <td>0.414</td>\n",
904
+ " <td>0.740</td>\n",
905
+ " <td>...</td>\n",
906
+ " <td>0.401</td>\n",
907
+ " <td>0.408</td>\n",
908
+ " <td>0.575</td>\n",
909
+ " <td>0.570</td>\n",
910
+ " <td>NaN</td>\n",
911
+ " <td>NaN</td>\n",
912
+ " <td>0.5640</td>\n",
913
+ " <td>0.5760</td>\n",
914
+ " <td>0.352203</td>\n",
915
+ " <td>0.373463</td>\n",
916
+ " </tr>\n",
917
+ " <tr>\n",
918
+ " <th>1257</th>\n",
919
+ " <td>FineWeb-Edu</td>\n",
920
+ " <td>167000</td>\n",
921
+ " <td>0.509494</td>\n",
922
+ " <td>0.429</td>\n",
923
+ " <td>0.362</td>\n",
924
+ " <td>0.472</td>\n",
925
+ " <td>0.597</td>\n",
926
+ " <td>0.290</td>\n",
927
+ " <td>0.418</td>\n",
928
+ " <td>0.738</td>\n",
929
+ " <td>...</td>\n",
930
+ " <td>0.395</td>\n",
931
+ " <td>0.404</td>\n",
932
+ " <td>0.582</td>\n",
933
+ " <td>0.578</td>\n",
934
+ " <td>NaN</td>\n",
935
+ " <td>NaN</td>\n",
936
+ " <td>0.5670</td>\n",
937
+ " <td>0.5735</td>\n",
938
+ " <td>0.350671</td>\n",
939
+ " <td>0.374453</td>\n",
940
+ " </tr>\n",
941
+ " </tbody>\n",
942
+ "</table>\n",
943
+ "<p>5 rows × 21 columns</p>\n",
944
+ "</div>"
945
+ ],
946
+ "text/plain": [
947
+ " runname steps agg_score commonsense_qa/acc \\\n",
948
+ "1253 FineWeb-Edu 160000 0.507129 0.430 \n",
949
+ "1254 FineWeb-Edu 162000 0.509118 0.416 \n",
950
+ "1255 FineWeb-Edu 164000 0.507843 0.416 \n",
951
+ "1256 FineWeb-Edu 166000 0.508308 0.415 \n",
952
+ "1257 FineWeb-Edu 167000 0.509494 0.429 \n",
953
+ "\n",
954
+ " commonsense_qa/acc_norm hellaswag/acc hellaswag/acc_norm \\\n",
955
+ "1253 0.359 0.473 0.593 \n",
956
+ "1254 0.367 0.474 0.592 \n",
957
+ "1255 0.365 0.467 0.591 \n",
958
+ "1256 0.364 0.472 0.593 \n",
959
+ "1257 0.362 0.472 0.597 \n",
960
+ "\n",
961
+ " openbookqa/acc openbookqa/acc_norm piqa/acc ... siqa/acc \\\n",
962
+ "1253 0.282 0.418 0.744 ... 0.392 \n",
963
+ "1254 0.288 0.408 0.747 ... 0.390 \n",
964
+ "1255 0.276 0.408 0.737 ... 0.395 \n",
965
+ "1256 0.282 0.414 0.740 ... 0.401 \n",
966
+ "1257 0.290 0.418 0.738 ... 0.395 \n",
967
+ "\n",
968
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
969
+ "1253 0.402 0.576 0.575 NaN \n",
970
+ "1254 0.409 0.572 0.577 NaN \n",
971
+ "1255 0.406 0.576 0.580 NaN \n",
972
+ "1256 0.408 0.575 0.570 NaN \n",
973
+ "1257 0.404 0.582 0.578 NaN \n",
974
+ "\n",
975
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
976
+ "1253 NaN 0.5670 0.5725 0.350226 0.374533 \n",
977
+ "1254 NaN 0.5720 0.5780 0.348268 0.372947 \n",
978
+ "1255 NaN 0.5635 0.5715 0.349943 0.372246 \n",
979
+ "1256 NaN 0.5640 0.5760 0.352203 0.373463 \n",
980
+ "1257 NaN 0.5670 0.5735 0.350671 0.374453 \n",
981
+ "\n",
982
+ "[5 rows x 21 columns]"
983
+ ]
984
+ },
985
+ "execution_count": 39,
986
+ "metadata": {},
987
+ "output_type": "execute_result"
988
+ }
989
+ ],
990
+ "source": [
991
+ "df_full = pd.concat([df, df_2], ignore_index=True)\n",
992
+ "df_full.tail()"
993
+ ]
994
+ },
995
+ {
996
+ "cell_type": "markdown",
997
+ "metadata": {},
998
+ "source": [
999
+ "### Guilherme-Board plot"
1000
+ ]
1001
+ },
1002
+ {
1003
+ "cell_type": "code",
1004
+ "execution_count": 45,
1005
+ "metadata": {},
1006
+ "outputs": [],
1007
+ "source": [
1008
+ "import os\n",
1009
+ "from matplotlib import pyplot as plt\n",
1010
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
1011
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
1012
+ "\n",
1013
+ "def normalize_runname(runname):\n",
1014
+ " return runname.replace(\"/\", \"_\")\n",
1015
+ "\n",
1016
+ "grouped = (\n",
1017
+ " df_full.groupby([\"runname\", \"steps\"])\n",
1018
+ " .agg(\n",
1019
+ " {\n",
1020
+ " key: \"mean\" for key in metrics\n",
1021
+ " }\n",
1022
+ " )\n",
1023
+ " .reset_index()\n",
1024
+ ")\n",
1025
+ "\n",
1026
+ "file_id=\"../assets/data/plots/edu_ablations\"\n",
1027
+ "files = {}\n",
1028
+ "for metric in metrics:\n",
1029
+ " datas = {}\n",
1030
+ " for name, group in grouped.groupby(\"runname\"):\n",
1031
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
1032
+ " group = group.set_index(\"steps\")\n",
1033
+ " rolling_avg = group\n",
1034
+ " # rolling_avg = group.rolling(window=5).mean()\n",
1035
+ " datas[name] = {\n",
1036
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
1037
+ " \"y\": rolling_avg[metric].tolist(),\n",
1038
+ " \"label\": name,\n",
1039
+ " }\n",
1040
+ " # Sort the runs by their final metric value, highest first\n",
1041
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
1042
+ " # Create a folder\n",
1043
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
1044
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
1045
+ " json.dump({\n",
1046
+ " \"data\": datas,\n",
1047
+ " \"layout\": {\n",
1048
+ " \"title\": {\n",
1049
+ " \"text\": \"FineWeb-Edu ablations\"\n",
1050
+ " },\n",
1051
+ " }\n",
1052
+ " }, f)\n",
1053
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
1054
+ "# Create index\n",
1055
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
1056
+ " json.dump({\n",
1057
+ " \"files\": files,\n",
1058
+ " \"settings\": {\n",
1059
+ " \"defaultMetric\": \"agg_score\",\n",
1060
+ " \"slider\":{\"min\":0,\"max\":30,\"default\":5}\n",
1061
+ " }\n",
1062
+ " }, f)"
1063
+ ]
1064
+ },
1065
+ {
1066
+ "cell_type": "markdown",
1067
+ "metadata": {},
1068
+ "source": [
1069
+ "### Barplot"
1070
+ ]
1071
+ },
1072
+ {
1073
+ "cell_type": "code",
1074
+ "execution_count": null,
1075
+ "metadata": {},
1076
+ "outputs": [
1077
+ {
1078
+ "name": "stdout",
1079
+ "output_type": "stream",
1080
+ "text": [
1081
+ "Requirement already satisfied: kaleido in /Users/hynky/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages (0.2.1)\n"
1082
+ ]
1083
+ }
1084
+ ],
1085
+ "source": [
1086
+ "!pip install -U kaleido"
1087
+ ]
1088
+ },
1089
+ {
1090
+ "cell_type": "code",
1091
+ "execution_count": null,
1092
+ "metadata": {},
1093
+ "outputs": [],
1094
+ "source": [
1095
+ "%load_ext autoreload\n",
1096
+ "%autoreload 2"
1097
+ ]
1098
+ },
1099
+ {
1100
+ "cell_type": "code",
1101
+ "execution_count": 33,
1102
+ "metadata": {},
1103
+ "outputs": [
1104
+ {
1105
+ "name": "stdout",
1106
+ "output_type": "stream",
1107
+ "text": [
1108
+ "Plot saved to plots/edu-100k.png\n"
1109
+ ]
1110
+ }
1111
+ ],
1112
+ "source": [
1113
+ "import plotly.express as px\n",
1114
+ "from plotly.subplots import make_subplots\n",
1115
+ "import plotly.graph_objects as go\n",
1116
+ "\n",
1117
+ "import json\n",
1118
+ "\n",
1119
+ "BASELINES = {\n",
1120
+ " \"mmlu/acc_norm\": 0.25,\n",
1121
+ " \"arc/acc_norm\": 0.25,\n",
1122
+ " \"openbookqa/acc_norm\": 0.25,\n",
1123
+ " \"piqa/acc_norm\": 0.5,\n",
1124
+ " \"hellaswag/acc_norm\": 0.25,\n",
1125
+ " \"siqa/acc_norm\": 0.33,\n",
1126
+ " \"winogrande/acc_norm\": 0.5,\n",
1127
+ "}\n",
1128
+ "\n",
1129
+ "\n",
1130
+ "def normalize_run_name(run_name):\n",
1131
+ " return run_name.replace(\"/\", \"_\")\n",
1132
+ "\n",
1133
+ "\n",
1134
+ "def save_for_bar(dir_name, df, metrics, default_metric=\"mmlu/acc_norm\", xlabel=\"Dataset\", plot_name=\"plot name\", custom_layout={}, ranges={}):\n",
1135
+ " import os\n",
1136
+ " files = {}\n",
1137
+ " os.makedirs(f\"../assets/data/plots/{dir_name}\", exist_ok=True)\n",
1138
+ " for metric in metrics:\n",
1139
+ " data = {}\n",
1140
+ " for run_name in df[\"runname\"].unique():\n",
1141
+ " data[run_name] = {\n",
1142
+ " \"x\": [run_name],\n",
1143
+ " \"y\": df[df[\"runname\"] == run_name][metric].tolist(),\n",
1144
+ " \"label\": run_name,\n",
1145
+ " }\n",
1146
+ " file_name = f\"{normalize_run_name(metric)}.json\"\n",
1147
+ " files[metric] = {\"file\": f\"{file_name}\"}\n",
1148
+ " with open(f\"../assets/data/plots/{dir_name}/{file_name}\", \"w\") as f:\n",
1149
+ " json.dump({\n",
1150
+ " \"data\": data,\n",
1151
+ " \"layout\": {\n",
1152
+ " \"showlegend\": False,\n",
1153
+ " \"title\": {\n",
1154
+ " \"text\": plot_name,\n",
1155
+ " },\n",
1156
+ " \"xaxis\": {\n",
1157
+ " \"title\": {\n",
1158
+ " \"text\": xlabel,\n",
1159
+ " \"standoff\": 30\n",
1160
+ " },\n",
1161
+ " \"tickangle\": 30\n",
1162
+ " },\n",
1163
+ " \"yaxis\": {\n",
1164
+ " \"range\": ranges.get(metric, [0, 1])\n",
1165
+ " },\n",
1166
+ " \"margin\": {\n",
1167
+ " \"b\": 100\n",
1168
+ " },\n",
1169
+ " **custom_layout,\n",
1170
+ " }\n",
1171
+ " }, f)\n",
1172
+ " with open(f\"../assets/data/plots/{dir_name}/index.json\", \"w\") as f:\n",
1173
+ " json.dump({\n",
1174
+ " \"files\": files,\n",
1175
+ " \"settings\": {\n",
1176
+ " \"defaultMetric\": default_metric,\n",
1177
+ " \"slider\": None,\n",
1178
+ " \"autoSetXRange\": False,\n",
1179
+ " \"type\": \"bar\"\n",
1180
+ " }\n",
1181
+ " }, f)\n",
1182
+ " return files\n",
1183
+ "\n",
1184
+ "def plot_metric_comparison(df, step, metrics, plot_name, run_name_replacements=None, output_file='comparison_plot_percentages.png', default_metric=\"mmlu/acc_norm\", custom_layout={}):\n",
1185
+ " \"\"\"\n",
1186
+ " Export bar-chart data comparing the given metrics across runs at the specified step; the JSON files are written by save_for_bar.\n",
1187
+ " \"\"\"\n",
1188
+ " if run_name_replacements:\n",
1189
+ " df['runname'] = df['runname'].replace(run_name_replacements)\n",
1190
+ "\n",
1191
+ " df_filtered = df[df['steps'] == step]\n",
1192
+ "\n",
1193
+ " # Compute the y-axis range used for each metric\n",
1194
+ "\n",
1195
+ "\n",
1196
+ " ranges = {}\n",
1197
+ " for i, metric in enumerate(metrics):\n",
1198
+ " yrange_start = BASELINES.get(metric, 0) * 0.9\n",
1199
+ " yrange_end = max(df_filtered[metric])\n",
1200
+ " # Extend the upper bound by 20% of the (end - start) span\n",
1201
+ " yrange_end = yrange_end + (yrange_end - yrange_start) * 0.2\n",
1202
+ " ranges[metric] = [yrange_start, yrange_end]\n",
1203
+ " \n",
1204
+ " file_name=f\"plots/{output_file}.png\"\n",
1205
+ " # fig.write_image(file_name)\n",
1206
+ " print(f\"Plot saved to {file_name}\")\n",
1207
+ "\n",
1208
+ " save_for_bar(output_file, df_filtered, metrics, default_metric, plot_name=plot_name, custom_layout=custom_layout, ranges=ranges)\n",
1209
+ "\n",
1210
+ "\n",
1211
+ "metrics = [\n",
1212
+ " \"mmlu/acc_norm\",\n",
1213
+ " \"arc/acc_norm\",\n",
1214
+ " \"openbookqa/acc_norm\",\n",
1215
+ " \"piqa/acc_norm\",\n",
1216
+ " \"hellaswag/acc_norm\",\n",
1217
+ " \"siqa/acc_norm\",\n",
1218
+ " \"winogrande/acc_norm\",\n",
1219
+ "]\n",
1220
+ "\n",
1221
+ "plot_metric_comparison(df_full, 100000, metrics, output_file=\"edu-100k\", plot_name=\"Evaluation results at 350B tokens\", run_name_replacements={\n",
1222
+ " \"FineWeb (ours)\": \"FineWeb\"\n",
1223
+ "})"
1224
+ ]
1225
+ },
1226
+ {
1227
+ "cell_type": "markdown",
1228
+ "metadata": {},
1229
+ "source": [
1230
+ "## Thresholds ablation"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "cell_type": "code",
1235
+ "execution_count": 16,
1236
+ "metadata": {},
1237
+ "outputs": [
1238
+ {
1239
+ "data": {
1240
+ "text/html": [
1241
+ "<div>\n",
1242
+ "<style scoped>\n",
1243
+ " .dataframe tbody tr th:only-of-type {\n",
1244
+ " vertical-align: middle;\n",
1245
+ " }\n",
1246
+ "\n",
1247
+ " .dataframe tbody tr th {\n",
1248
+ " vertical-align: top;\n",
1249
+ " }\n",
1250
+ "\n",
1251
+ " .dataframe thead th {\n",
1252
+ " text-align: right;\n",
1253
+ " }\n",
1254
+ "</style>\n",
1255
+ "<table border=\"1\" class=\"dataframe\">\n",
1256
+ " <thead>\n",
1257
+ " <tr style=\"text-align: right;\">\n",
1258
+ " <th></th>\n",
1259
+ " <th>runname</th>\n",
1260
+ " <th>steps</th>\n",
1261
+ " <th>agg_score</th>\n",
1262
+ " <th>commonsense_qa/acc</th>\n",
1263
+ " <th>commonsense_qa/acc_norm</th>\n",
1264
+ " <th>hellaswag/acc</th>\n",
1265
+ " <th>hellaswag/acc_norm</th>\n",
1266
+ " <th>openbookqa/acc</th>\n",
1267
+ " <th>openbookqa/acc_norm</th>\n",
1268
+ " <th>piqa/acc</th>\n",
1269
+ " <th>...</th>\n",
1270
+ " <th>siqa/acc</th>\n",
1271
+ " <th>siqa/acc_norm</th>\n",
1272
+ " <th>winogrande/acc</th>\n",
1273
+ " <th>winogrande/acc_norm</th>\n",
1274
+ " <th>sciq/acc</th>\n",
1275
+ " <th>sciq/acc_norm</th>\n",
1276
+ " <th>arc/acc</th>\n",
1277
+ " <th>arc/acc_norm</th>\n",
1278
+ " <th>mmlu/acc</th>\n",
1279
+ " <th>mmlu/acc_norm</th>\n",
1280
+ " </tr>\n",
1281
+ " </thead>\n",
1282
+ " <tbody>\n",
1283
+ " <tr>\n",
1284
+ " <th>0</th>\n",
1285
+ " <td>C4</td>\n",
1286
+ " <td>0</td>\n",
1287
+ " <td>0.330893</td>\n",
1288
+ " <td>0.186</td>\n",
1289
+ " <td>0.233</td>\n",
1290
+ " <td>0.272</td>\n",
1291
+ " <td>0.258</td>\n",
1292
+ " <td>0.166</td>\n",
1293
+ " <td>0.286</td>\n",
1294
+ " <td>0.542</td>\n",
1295
+ " <td>...</td>\n",
1296
+ " <td>0.367</td>\n",
1297
+ " <td>0.362</td>\n",
1298
+ " <td>0.516</td>\n",
1299
+ " <td>0.497</td>\n",
1300
+ " <td>0.208</td>\n",
1301
+ " <td>0.202</td>\n",
1302
+ " <td>0.2195</td>\n",
1303
+ " <td>0.2510</td>\n",
1304
+ " <td>0.230294</td>\n",
1305
+ " <td>0.250147</td>\n",
1306
+ " </tr>\n",
1307
+ " <tr>\n",
1308
+ " <th>1</th>\n",
1309
+ " <td>C4</td>\n",
1310
+ " <td>1000</td>\n",
1311
+ " <td>0.355112</td>\n",
1312
+ " <td>0.229</td>\n",
1313
+ " <td>0.260</td>\n",
1314
+ " <td>0.286</td>\n",
1315
+ " <td>0.288</td>\n",
1316
+ " <td>0.128</td>\n",
1317
+ " <td>0.250</td>\n",
1318
+ " <td>0.614</td>\n",
1319
+ " <td>...</td>\n",
1320
+ " <td>0.351</td>\n",
1321
+ " <td>0.404</td>\n",
1322
+ " <td>0.519</td>\n",
1323
+ " <td>0.476</td>\n",
1324
+ " <td>0.565</td>\n",
1325
+ " <td>0.518</td>\n",
1326
+ " <td>0.2680</td>\n",
1327
+ " <td>0.2935</td>\n",
1328
+ " <td>0.238951</td>\n",
1329
+ " <td>0.250399</td>\n",
1330
+ " </tr>\n",
1331
+ " <tr>\n",
1332
+ " <th>2</th>\n",
1333
+ " <td>C4</td>\n",
1334
+ " <td>2000</td>\n",
1335
+ " <td>0.378435</td>\n",
1336
+ " <td>0.268</td>\n",
1337
+ " <td>0.278</td>\n",
1338
+ " <td>0.312</td>\n",
1339
+ " <td>0.330</td>\n",
1340
+ " <td>0.122</td>\n",
1341
+ " <td>0.276</td>\n",
1342
+ " <td>0.646</td>\n",
1343
+ " <td>...</td>\n",
1344
+ " <td>0.375</td>\n",
1345
+ " <td>0.400</td>\n",
1346
+ " <td>0.509</td>\n",
1347
+ " <td>0.500</td>\n",
1348
+ " <td>0.676</td>\n",
1349
+ " <td>0.577</td>\n",
1350
+ " <td>0.3065</td>\n",
1351
+ " <td>0.3230</td>\n",
1352
+ " <td>0.247275</td>\n",
1353
+ " <td>0.255482</td>\n",
1354
+ " </tr>\n",
1355
+ " <tr>\n",
1356
+ " <th>3</th>\n",
1357
+ " <td>C4</td>\n",
1358
+ " <td>3000</td>\n",
1359
+ " <td>0.387795</td>\n",
1360
+ " <td>0.280</td>\n",
1361
+ " <td>0.295</td>\n",
1362
+ " <td>0.331</td>\n",
1363
+ " <td>0.380</td>\n",
1364
+ " <td>0.152</td>\n",
1365
+ " <td>0.274</td>\n",
1366
+ " <td>0.660</td>\n",
1367
+ " <td>...</td>\n",
1368
+ " <td>0.376</td>\n",
1369
+ " <td>0.387</td>\n",
1370
+ " <td>0.512</td>\n",
1371
+ " <td>0.496</td>\n",
1372
+ " <td>0.725</td>\n",
1373
+ " <td>0.621</td>\n",
1374
+ " <td>0.3175</td>\n",
1375
+ " <td>0.3340</td>\n",
1376
+ " <td>0.254534</td>\n",
1377
+ " <td>0.267363</td>\n",
1378
+ " </tr>\n",
1379
+ " <tr>\n",
1380
+ " <th>4</th>\n",
1381
+ " <td>C4</td>\n",
1382
+ " <td>4000</td>\n",
1383
+ " <td>0.399320</td>\n",
1384
+ " <td>0.296</td>\n",
1385
+ " <td>0.298</td>\n",
1386
+ " <td>0.351</td>\n",
1387
+ " <td>0.406</td>\n",
1388
+ " <td>0.168</td>\n",
1389
+ " <td>0.282</td>\n",
1390
+ " <td>0.676</td>\n",
1391
+ " <td>...</td>\n",
1392
+ " <td>0.382</td>\n",
1393
+ " <td>0.404</td>\n",
1394
+ " <td>0.522</td>\n",
1395
+ " <td>0.503</td>\n",
1396
+ " <td>0.723</td>\n",
1397
+ " <td>0.618</td>\n",
1398
+ " <td>0.3255</td>\n",
1399
+ " <td>0.3470</td>\n",
1400
+ " <td>0.254762</td>\n",
1401
+ " <td>0.263563</td>\n",
1402
+ " </tr>\n",
1403
+ " <tr>\n",
1404
+ " <th>...</th>\n",
1405
+ " <td>...</td>\n",
1406
+ " <td>...</td>\n",
1407
+ " <td>...</td>\n",
1408
+ " <td>...</td>\n",
1409
+ " <td>...</td>\n",
1410
+ " <td>...</td>\n",
1411
+ " <td>...</td>\n",
1412
+ " <td>...</td>\n",
1413
+ " <td>...</td>\n",
1414
+ " <td>...</td>\n",
1415
+ " <td>...</td>\n",
1416
+ " <td>...</td>\n",
1417
+ " <td>...</td>\n",
1418
+ " <td>...</td>\n",
1419
+ " <td>...</td>\n",
1420
+ " <td>...</td>\n",
1421
+ " <td>...</td>\n",
1422
+ " <td>...</td>\n",
1423
+ " <td>...</td>\n",
1424
+ " <td>...</td>\n",
1425
+ " <td>...</td>\n",
1426
+ " </tr>\n",
1427
+ " <tr>\n",
1428
+ " <th>1171</th>\n",
1429
+ " <td>The Pile</td>\n",
1430
+ " <td>163000</td>\n",
1431
+ " <td>0.463789</td>\n",
1432
+ " <td>0.379</td>\n",
1433
+ " <td>0.349</td>\n",
1434
+ " <td>0.441</td>\n",
1435
+ " <td>0.555</td>\n",
1436
+ " <td>0.240</td>\n",
1437
+ " <td>0.366</td>\n",
1438
+ " <td>0.701</td>\n",
1439
+ " <td>...</td>\n",
1440
+ " <td>0.405</td>\n",
1441
+ " <td>0.388</td>\n",
1442
+ " <td>0.585</td>\n",
1443
+ " <td>0.560</td>\n",
1444
+ " <td>0.875</td>\n",
1445
+ " <td>0.820</td>\n",
1446
+ " <td>0.4475</td>\n",
1447
+ " <td>0.4450</td>\n",
1448
+ " <td>0.299378</td>\n",
1449
+ " <td>0.326313</td>\n",
1450
+ " </tr>\n",
1451
+ " <tr>\n",
1452
+ " <th>1172</th>\n",
1453
+ " <td>The Pile</td>\n",
1454
+ " <td>164000</td>\n",
1455
+ " <td>0.462758</td>\n",
1456
+ " <td>0.369</td>\n",
1457
+ " <td>0.344</td>\n",
1458
+ " <td>0.438</td>\n",
1459
+ " <td>0.552</td>\n",
1460
+ " <td>0.248</td>\n",
1461
+ " <td>0.348</td>\n",
1462
+ " <td>0.708</td>\n",
1463
+ " <td>...</td>\n",
1464
+ " <td>0.395</td>\n",
1465
+ " <td>0.401</td>\n",
1466
+ " <td>0.577</td>\n",
1467
+ " <td>0.567</td>\n",
1468
+ " <td>0.874</td>\n",
1469
+ " <td>0.806</td>\n",
1470
+ " <td>0.4465</td>\n",
1471
+ " <td>0.4355</td>\n",
1472
+ " <td>0.302083</td>\n",
1473
+ " <td>0.331563</td>\n",
1474
+ " </tr>\n",
1475
+ " <tr>\n",
1476
+ " <th>1173</th>\n",
1477
+ " <td>The Pile</td>\n",
1478
+ " <td>165000</td>\n",
1479
+ " <td>0.465026</td>\n",
1480
+ " <td>0.383</td>\n",
1481
+ " <td>0.350</td>\n",
1482
+ " <td>0.438</td>\n",
1483
+ " <td>0.553</td>\n",
1484
+ " <td>0.234</td>\n",
1485
+ " <td>0.352</td>\n",
1486
+ " <td>0.707</td>\n",
1487
+ " <td>...</td>\n",
1488
+ " <td>0.400</td>\n",
1489
+ " <td>0.401</td>\n",
1490
+ " <td>0.569</td>\n",
1491
+ " <td>0.556</td>\n",
1492
+ " <td>0.874</td>\n",
1493
+ " <td>0.811</td>\n",
1494
+ " <td>0.4460</td>\n",
1495
+ " <td>0.4455</td>\n",
1496
+ " <td>0.305193</td>\n",
1497
+ " <td>0.331708</td>\n",
1498
+ " </tr>\n",
1499
+ " <tr>\n",
1500
+ " <th>1174</th>\n",
1501
+ " <td>The Pile</td>\n",
1502
+ " <td>166000</td>\n",
1503
+ " <td>0.462349</td>\n",
1504
+ " <td>0.377</td>\n",
1505
+ " <td>0.346</td>\n",
1506
+ " <td>0.440</td>\n",
1507
+ " <td>0.557</td>\n",
1508
+ " <td>0.228</td>\n",
1509
+ " <td>0.346</td>\n",
1510
+ " <td>0.711</td>\n",
1511
+ " <td>...</td>\n",
1512
+ " <td>0.398</td>\n",
1513
+ " <td>0.398</td>\n",
1514
+ " <td>0.572</td>\n",
1515
+ " <td>0.558</td>\n",
1516
+ " <td>0.877</td>\n",
1517
+ " <td>0.811</td>\n",
1518
+ " <td>0.4525</td>\n",
1519
+ " <td>0.4385</td>\n",
1520
+ " <td>0.301952</td>\n",
1521
+ " <td>0.331295</td>\n",
1522
+ " </tr>\n",
1523
+ " <tr>\n",
1524
+ " <th>1175</th>\n",
1525
+ " <td>The Pile</td>\n",
1526
+ " <td>167000</td>\n",
1527
+ " <td>0.464539</td>\n",
1528
+ " <td>0.386</td>\n",
1529
+ " <td>0.354</td>\n",
1530
+ " <td>0.434</td>\n",
1531
+ " <td>0.557</td>\n",
1532
+ " <td>0.232</td>\n",
1533
+ " <td>0.356</td>\n",
1534
+ " <td>0.706</td>\n",
1535
+ " <td>...</td>\n",
1536
+ " <td>0.402</td>\n",
1537
+ " <td>0.402</td>\n",
1538
+ " <td>0.573</td>\n",
1539
+ " <td>0.559</td>\n",
1540
+ " <td>0.867</td>\n",
1541
+ " <td>0.802</td>\n",
1542
+ " <td>0.4475</td>\n",
1543
+ " <td>0.4375</td>\n",
1544
+ " <td>0.301934</td>\n",
1545
+ " <td>0.330810</td>\n",
1546
+ " </tr>\n",
1547
+ " </tbody>\n",
1548
+ "</table>\n",
1549
+ "<p>1176 rows × 21 columns</p>\n",
1550
+ "</div>"
1551
+ ],
1552
+ "text/plain": [
1553
+ " runname steps agg_score commonsense_qa/acc \\\n",
1554
+ "0 C4 0 0.330893 0.186 \n",
1555
+ "1 C4 1000 0.355112 0.229 \n",
1556
+ "2 C4 2000 0.378435 0.268 \n",
1557
+ "3 C4 3000 0.387795 0.280 \n",
1558
+ "4 C4 4000 0.399320 0.296 \n",
1559
+ "... ... ... ... ... \n",
1560
+ "1171 The Pile 163000 0.463789 0.379 \n",
1561
+ "1172 The Pile 164000 0.462758 0.369 \n",
1562
+ "1173 The Pile 165000 0.465026 0.383 \n",
1563
+ "1174 The Pile 166000 0.462349 0.377 \n",
1564
+ "1175 The Pile 167000 0.464539 0.386 \n",
1565
+ "\n",
1566
+ " commonsense_qa/acc_norm hellaswag/acc hellaswag/acc_norm \\\n",
1567
+ "0 0.233 0.272 0.258 \n",
1568
+ "1 0.260 0.286 0.288 \n",
1569
+ "2 0.278 0.312 0.330 \n",
1570
+ "3 0.295 0.331 0.380 \n",
1571
+ "4 0.298 0.351 0.406 \n",
1572
+ "... ... ... ... \n",
1573
+ "1171 0.349 0.441 0.555 \n",
1574
+ "1172 0.344 0.438 0.552 \n",
1575
+ "1173 0.350 0.438 0.553 \n",
1576
+ "1174 0.346 0.440 0.557 \n",
1577
+ "1175 0.354 0.434 0.557 \n",
1578
+ "\n",
1579
+ " openbookqa/acc openbookqa/acc_norm piqa/acc ... siqa/acc \\\n",
1580
+ "0 0.166 0.286 0.542 ... 0.367 \n",
1581
+ "1 0.128 0.250 0.614 ... 0.351 \n",
1582
+ "2 0.122 0.276 0.646 ... 0.375 \n",
1583
+ "3 0.152 0.274 0.660 ... 0.376 \n",
1584
+ "4 0.168 0.282 0.676 ... 0.382 \n",
1585
+ "... ... ... ... ... ... \n",
1586
+ "1171 0.240 0.366 0.701 ... 0.405 \n",
1587
+ "1172 0.248 0.348 0.708 ... 0.395 \n",
1588
+ "1173 0.234 0.352 0.707 ... 0.400 \n",
1589
+ "1174 0.228 0.346 0.711 ... 0.398 \n",
1590
+ "1175 0.232 0.356 0.706 ... 0.402 \n",
1591
+ "\n",
1592
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
1593
+ "0 0.362 0.516 0.497 0.208 \n",
1594
+ "1 0.404 0.519 0.476 0.565 \n",
1595
+ "2 0.400 0.509 0.500 0.676 \n",
1596
+ "3 0.387 0.512 0.496 0.725 \n",
1597
+ "4 0.404 0.522 0.503 0.723 \n",
1598
+ "... ... ... ... ... \n",
1599
+ "1171 0.388 0.585 0.560 0.875 \n",
1600
+ "1172 0.401 0.577 0.567 0.874 \n",
1601
+ "1173 0.401 0.569 0.556 0.874 \n",
1602
+ "1174 0.398 0.572 0.558 0.877 \n",
1603
+ "1175 0.402 0.573 0.559 0.867 \n",
1604
+ "\n",
1605
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
1606
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
1607
+ "1 0.518 0.2680 0.2935 0.238951 0.250399 \n",
1608
+ "2 0.577 0.3065 0.3230 0.247275 0.255482 \n",
1609
+ "3 0.621 0.3175 0.3340 0.254534 0.267363 \n",
1610
+ "4 0.618 0.3255 0.3470 0.254762 0.263563 \n",
1611
+ "... ... ... ... ... ... \n",
1612
+ "1171 0.820 0.4475 0.4450 0.299378 0.326313 \n",
1613
+ "1172 0.806 0.4465 0.4355 0.302083 0.331563 \n",
1614
+ "1173 0.811 0.4460 0.4455 0.305193 0.331708 \n",
1615
+ "1174 0.811 0.4525 0.4385 0.301952 0.331295 \n",
1616
+ "1175 0.802 0.4475 0.4375 0.301934 0.330810 \n",
1617
+ "\n",
1618
+ "[1176 rows x 21 columns]"
1619
+ ]
1620
+ },
1621
+ "execution_count": 16,
1622
+ "metadata": {},
1623
+ "output_type": "execute_result"
1624
+ }
1625
+ ],
1626
+ "source": [
1627
+ "df"
1628
+ ]
1629
+ },
1630
+ {
1631
+ "cell_type": "code",
1632
+ "execution_count": 24,
1633
+ "metadata": {},
1634
+ "outputs": [
1635
+ {
1636
+ "name": "stderr",
1637
+ "output_type": "stream",
1638
+ "text": [
1639
+ "Fetching datafiles...: 100%|██████████| 4/4 [00:00<00:00, 21.68it/s]\n",
1640
+ "Loading evals data...: 100%|██████████| 26/26 [00:04<00:00, 5.76it/s]"
1641
+ ]
1642
+ },
1643
+ {
1644
+ "name": "stdout",
1645
+ "output_type": "stream",
1646
+ "text": [
1647
+ "Metrics saved to loubna-ablations_faq_metrics.csv\n"
1648
+ ]
1649
+ },
1650
+ {
1651
+ "name": "stderr",
1652
+ "output_type": "stream",
1653
+ "text": [
1654
+ "\n"
1655
+ ]
1656
+ }
1657
+ ],
1658
+ "source": [
1659
+ "token = os.getenv(\"HF_TOKEN\")\n",
1660
+ "repo_name = \"HuggingFaceTB/loubna-ablations_faq\"\n",
1661
+ "runs_to_fetch = [\"filtered_web_min_score_4_fix-seed-1-\", \"fineweb_2B_educational_minimum_score_3-seed-0-\", \"fineweb_2B_educational_regression-seed-6-\", \"fineweb_2024_10_all_2B-seed-6-\"]\n",
1662
+ "steps_to_fetch = \"%1000\"\n",
1663
+ "prefix = \"tb/ablations_faq-1p81G-\"\n",
1664
+ "metrics = ['commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
1665
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
1666
+ "agg_score_columns = metrics\n",
1667
+ "column_name = \"agg_score\"\n",
1668
+ "seed_merge_method = \"mean\"\n",
1669
+ "oauth_token = token\n",
1670
+ "\n",
1671
+ "runs_to_fetch = [prefix + run for run in runs_to_fetch]\n",
1672
+ "df = fetch_run_results_simple(repo_name, runs_to_fetch, steps_to_fetch, prefix, agg_score_columns, column_name, seed_merge_method, oauth_token=token)"
1673
+ ]
1674
+ },
1675
+ {
1676
+ "cell_type": "code",
1677
+ "execution_count": 25,
1678
+ "metadata": {},
1679
+ "outputs": [
1680
+ {
1681
+ "data": {
1682
+ "text/html": [
1683
+ "<div>\n",
1684
+ "<style scoped>\n",
1685
+ " .dataframe tbody tr th:only-of-type {\n",
1686
+ " vertical-align: middle;\n",
1687
+ " }\n",
1688
+ "\n",
1689
+ " .dataframe tbody tr th {\n",
1690
+ " vertical-align: top;\n",
1691
+ " }\n",
1692
+ "\n",
1693
+ " .dataframe thead th {\n",
1694
+ " text-align: right;\n",
1695
+ " }\n",
1696
+ "</style>\n",
1697
+ "<table border=\"1\" class=\"dataframe\">\n",
1698
+ " <thead>\n",
1699
+ " <tr style=\"text-align: right;\">\n",
1700
+ " <th></th>\n",
1701
+ " <th>runname</th>\n",
1702
+ " <th>seed</th>\n",
1703
+ " <th>steps</th>\n",
1704
+ " <th>agg_score</th>\n",
1705
+ " <th>commonsense_qa/acc</th>\n",
1706
+ " <th>commonsense_qa/acc_norm</th>\n",
1707
+ " <th>hellaswag/acc</th>\n",
1708
+ " <th>hellaswag/acc_norm</th>\n",
1709
+ " <th>openbookqa/acc</th>\n",
1710
+ " <th>openbookqa/acc_norm</th>\n",
1711
+ " <th>...</th>\n",
1712
+ " <th>siqa/acc</th>\n",
1713
+ " <th>siqa/acc_norm</th>\n",
1714
+ " <th>winogrande/acc</th>\n",
1715
+ " <th>winogrande/acc_norm</th>\n",
1716
+ " <th>all/acc</th>\n",
1717
+ " <th>all/acc_norm</th>\n",
1718
+ " <th>arc/acc</th>\n",
1719
+ " <th>arc/acc_norm</th>\n",
1720
+ " <th>mmlu/acc</th>\n",
1721
+ " <th>mmlu/acc_norm</th>\n",
1722
+ " </tr>\n",
1723
+ " </thead>\n",
1724
+ " <tbody>\n",
1725
+ " <tr>\n",
1726
+ " <th>0</th>\n",
1727
+ " <td>FineWeb (FW)</td>\n",
1728
+ " <td>0</td>\n",
1729
+ " <td>4000</td>\n",
1730
+ " <td>0.389983</td>\n",
1731
+ " <td>0.275</td>\n",
1732
+ " <td>0.281</td>\n",
1733
+ " <td>0.352</td>\n",
1734
+ " <td>0.383</td>\n",
1735
+ " <td>0.152</td>\n",
1736
+ " <td>0.286</td>\n",
1737
+ " <td>...</td>\n",
1738
+ " <td>0.365</td>\n",
1739
+ " <td>0.385</td>\n",
1740
+ " <td>0.505</td>\n",
1741
+ " <td>0.493</td>\n",
1742
+ " <td>0.265054</td>\n",
1743
+ " <td>0.281046</td>\n",
1744
+ " <td>0.3265</td>\n",
1745
+ " <td>0.3435</td>\n",
1746
+ " <td>0.250500</td>\n",
1747
+ " <td>0.264368</td>\n",
1748
+ " </tr>\n",
1749
+ " <tr>\n",
1750
+ " <th>0</th>\n",
1751
+ " <td>FineWeb (FW)</td>\n",
1752
+ " <td>0</td>\n",
1753
+ " <td>5000</td>\n",
1754
+ " <td>0.397987</td>\n",
1755
+ " <td>0.303</td>\n",
1756
+ " <td>0.297</td>\n",
1757
+ " <td>0.349</td>\n",
1758
+ " <td>0.397</td>\n",
1759
+ " <td>0.154</td>\n",
1760
+ " <td>0.290</td>\n",
1761
+ " <td>...</td>\n",
1762
+ " <td>0.375</td>\n",
1763
+ " <td>0.383</td>\n",
1764
+ " <td>0.509</td>\n",
1765
+ " <td>0.502</td>\n",
1766
+ " <td>0.268548</td>\n",
1767
+ " <td>0.282678</td>\n",
1768
+ " <td>0.3340</td>\n",
1769
+ " <td>0.3560</td>\n",
1770
+ " <td>0.253134</td>\n",
1771
+ " <td>0.264896</td>\n",
1772
+ " </tr>\n",
1773
+ " <tr>\n",
1774
+ " <th>0</th>\n",
1775
+ " <td>FineWeb (FW)</td>\n",
1776
+ " <td>0</td>\n",
1777
+ " <td>6000</td>\n",
1778
+ " <td>0.403954</td>\n",
1779
+ " <td>0.317</td>\n",
1780
+ " <td>0.319</td>\n",
1781
+ " <td>0.359</td>\n",
1782
+ " <td>0.416</td>\n",
1783
+ " <td>0.166</td>\n",
1784
+ " <td>0.284</td>\n",
1785
+ " <td>...</td>\n",
1786
+ " <td>0.379</td>\n",
1787
+ " <td>0.400</td>\n",
1788
+ " <td>0.516</td>\n",
1789
+ " <td>0.490</td>\n",
1790
+ " <td>0.268197</td>\n",
1791
+ " <td>0.286678</td>\n",
1792
+ " <td>0.3330</td>\n",
1793
+ " <td>0.3590</td>\n",
1794
+ " <td>0.252102</td>\n",
1795
+ " <td>0.268633</td>\n",
1796
+ " </tr>\n",
1797
+ " <tr>\n",
1798
+ " <th>0</th>\n",
1799
+ " <td>FineWeb (FW)</td>\n",
1800
+ " <td>0</td>\n",
1801
+ " <td>7000</td>\n",
1802
+ " <td>0.404859</td>\n",
1803
+ " <td>0.298</td>\n",
1804
+ " <td>0.310</td>\n",
1805
+ " <td>0.367</td>\n",
1806
+ " <td>0.424</td>\n",
1807
+ " <td>0.176</td>\n",
1808
+ " <td>0.290</td>\n",
1809
+ " <td>...</td>\n",
1810
+ " <td>0.382</td>\n",
1811
+ " <td>0.396</td>\n",
1812
+ " <td>0.511</td>\n",
1813
+ " <td>0.494</td>\n",
1814
+ " <td>0.271701</td>\n",
1815
+ " <td>0.289459</td>\n",
1816
+ " <td>0.3250</td>\n",
1817
+ " <td>0.3510</td>\n",
1818
+ " <td>0.256203</td>\n",
1819
+ " <td>0.271874</td>\n",
1820
+ " </tr>\n",
1821
+ " <tr>\n",
1822
+ " <th>0</th>\n",
1823
+ " <td>FineWeb (FW)</td>\n",
1824
+ " <td>0</td>\n",
1825
+ " <td>8000</td>\n",
1826
+ " <td>0.403283</td>\n",
1827
+ " <td>0.330</td>\n",
1828
+ " <td>0.319</td>\n",
1829
+ " <td>0.364</td>\n",
1830
+ " <td>0.412</td>\n",
1831
+ " <td>0.176</td>\n",
1832
+ " <td>0.276</td>\n",
1833
+ " <td>...</td>\n",
1834
+ " <td>0.383</td>\n",
1835
+ " <td>0.403</td>\n",
1836
+ " <td>0.510</td>\n",
1837
+ " <td>0.493</td>\n",
1838
+ " <td>0.267533</td>\n",
1839
+ " <td>0.287018</td>\n",
1840
+ " <td>0.3295</td>\n",
1841
+ " <td>0.3510</td>\n",
1842
+ " <td>0.251046</td>\n",
1843
+ " <td>0.269266</td>\n",
1844
+ " </tr>\n",
1845
+ " </tbody>\n",
1846
+ "</table>\n",
1847
+ "<p>5 rows × 22 columns</p>\n",
1848
+ "</div>"
1849
+ ],
1850
+ "text/plain": [
1851
+ " runname seed steps agg_score commonsense_qa/acc \\\n",
1852
+ "0 FineWeb (FW) 0 4000 0.389983 0.275 \n",
1853
+ "0 FineWeb (FW) 0 5000 0.397987 0.303 \n",
1854
+ "0 FineWeb (FW) 0 6000 0.403954 0.317 \n",
1855
+ "0 FineWeb (FW) 0 7000 0.404859 0.298 \n",
1856
+ "0 FineWeb (FW) 0 8000 0.403283 0.330 \n",
1857
+ "\n",
1858
+ " commonsense_qa/acc_norm hellaswag/acc hellaswag/acc_norm openbookqa/acc \\\n",
1859
+ "0 0.281 0.352 0.383 0.152 \n",
1860
+ "0 0.297 0.349 0.397 0.154 \n",
1861
+ "0 0.319 0.359 0.416 0.166 \n",
1862
+ "0 0.310 0.367 0.424 0.176 \n",
1863
+ "0 0.319 0.364 0.412 0.176 \n",
1864
+ "\n",
1865
+ " openbookqa/acc_norm ... siqa/acc siqa/acc_norm winogrande/acc \\\n",
1866
+ "0 0.286 ... 0.365 0.385 0.505 \n",
1867
+ "0 0.290 ... 0.375 0.383 0.509 \n",
1868
+ "0 0.284 ... 0.379 0.400 0.516 \n",
1869
+ "0 0.290 ... 0.382 0.396 0.511 \n",
1870
+ "0 0.276 ... 0.383 0.403 0.510 \n",
1871
+ "\n",
1872
+ " winogrande/acc_norm all/acc all/acc_norm arc/acc arc/acc_norm \\\n",
1873
+ "0 0.493 0.265054 0.281046 0.3265 0.3435 \n",
1874
+ "0 0.502 0.268548 0.282678 0.3340 0.3560 \n",
1875
+ "0 0.490 0.268197 0.286678 0.3330 0.3590 \n",
1876
+ "0 0.494 0.271701 0.289459 0.3250 0.3510 \n",
1877
+ "0 0.493 0.267533 0.287018 0.3295 0.3510 \n",
1878
+ "\n",
1879
+ " mmlu/acc mmlu/acc_norm \n",
1880
+ "0 0.250500 0.264368 \n",
1881
+ "0 0.253134 0.264896 \n",
1882
+ "0 0.252102 0.268633 \n",
1883
+ "0 0.256203 0.271874 \n",
1884
+ "0 0.251046 0.269266 \n",
1885
+ "\n",
1886
+ "[5 rows x 22 columns]"
1887
+ ]
1888
+ },
1889
+ "execution_count": 25,
1890
+ "metadata": {},
1891
+ "output_type": "execute_result"
1892
+ }
1893
+ ],
1894
+ "source": [
1895
+ "df['runname'] = df['runname'].replace({\"filtered_web_min_score_4_fix-seed-1\": \"FW-Edu-threshold=4\",\n",
1896
+ " \"fineweb_2B_educational_minimum_score_3-seed-0\": \"FW-Edu-threshold=3\",\n",
1897
+ " \"fineweb_2B_educational_regression-seed-6\": \"FW-Edu-threshold=2\",\n",
1898
+ " \"fineweb_2024_10_all_2B-seed-6\": \"FineWeb (FW)\"}, regex=True)\n",
1899
+ "df.tail()"
1900
+ ]
1901
+ },
1902
+ {
1903
+ "cell_type": "code",
1904
+ "execution_count": 26,
1905
+ "metadata": {},
1906
+ "outputs": [
1907
+ {
1908
+ "data": {
1909
+ "text/plain": [
1910
+ "0 FW-Edu-threshold=4\n",
1911
+ "0 FW-Edu-threshold=4\n",
1912
+ "0 FW-Edu-threshold=4\n",
1913
+ "0 FW-Edu-threshold=4\n",
1914
+ "0 FW-Edu-threshold=4\n",
1915
+ "0 FW-Edu-threshold=4\n",
1916
+ "0 FW-Edu-threshold=4\n",
1917
+ "0 FW-Edu-threshold=4\n",
1918
+ "0 FW-Edu-threshold=3\n",
1919
+ "0 FW-Edu-threshold=3\n",
1920
+ "0 FW-Edu-threshold=3\n",
1921
+ "0 FW-Edu-threshold=3\n",
1922
+ "0 FW-Edu-threshold=3\n",
1923
+ "0 FW-Edu-threshold=2\n",
1924
+ "0 FW-Edu-threshold=2\n",
1925
+ "0 FW-Edu-threshold=2\n",
1926
+ "0 FW-Edu-threshold=2\n",
1927
+ "0 FW-Edu-threshold=2\n",
1928
+ "0 FineWeb (FW)\n",
1929
+ "0 FineWeb (FW)\n",
1930
+ "0 FineWeb (FW)\n",
1931
+ "0 FineWeb (FW)\n",
1932
+ "0 FineWeb (FW)\n",
1933
+ "0 FineWeb (FW)\n",
1934
+ "0 FineWeb (FW)\n",
1935
+ "0 FineWeb (FW)\n",
1936
+ "Name: runname, dtype: object"
1937
+ ]
1938
+ },
1939
+ "execution_count": 26,
1940
+ "metadata": {},
1941
+ "output_type": "execute_result"
1942
+ }
1943
+ ],
1944
+ "source": [
1945
+ "df[\"runname\"]"
1946
+ ]
1947
+ },
1948
+ {
1949
+ "cell_type": "code",
1950
+ "execution_count": null,
1951
+ "metadata": {},
1952
+ "outputs": [],
1953
+ "source": []
1954
+ },
1955
+ {
1956
+ "cell_type": "code",
1957
+ "execution_count": 34,
1958
+ "metadata": {},
1959
+ "outputs": [
1960
+ {
1961
+ "name": "stdout",
1962
+ "output_type": "stream",
1963
+ "text": [
1964
+ "Plot saved to plots/edu-8k.png\n"
1965
+ ]
1966
+ }
1967
+ ],
1968
+ "source": [
1969
+ "\n",
1970
+ "metrics = [\n",
1971
+ " \"mmlu/acc_norm\",\n",
1972
+ " \"arc/acc_norm\",\n",
1973
+ " \"openbookqa/acc_norm\",\n",
1974
+ " \"piqa/acc_norm\",\n",
1975
+ " \"hellaswag/acc_norm\",\n",
1976
+ " \"siqa/acc_norm\",\n",
1977
+ " \"winogrande/acc_norm\",\n",
1978
+ "]\n",
1979
+ "plot_metric_comparison(df, 8000, metrics, output_file=\"edu-8k\", plot_name=\"FineWeb-Edu thresholding\", custom_layout={\n",
1980
+ " \"xaxis\": {\n",
1981
+ " \"title\": {\n",
1982
+ " \"standoff\": 60,\n",
1983
+ " \"text\": \"Dataset\"\n",
1984
+ " },\n",
1985
+ " \"tickangle\": 30\n",
1986
+ " },\n",
1987
+ " \"margin\": {\n",
1988
+ " \"b\": 120\n",
1989
+ " }\n",
1990
+ "})"
1991
+ ]
1992
+ }
1993
+ ],
1994
+ "metadata": {
1995
+ "kernelspec": {
1996
+ "display_name": "textbooks",
1997
+ "language": "python",
1998
+ "name": "python3"
1999
+ },
2000
+ "language_info": {
2001
+ "codemirror_mode": {
2002
+ "name": "ipython",
2003
+ "version": 3
2004
+ },
2005
+ "file_extension": ".py",
2006
+ "mimetype": "text/x-python",
2007
+ "name": "python",
2008
+ "nbconvert_exporter": "python",
2009
+ "pygments_lexer": "ipython3",
2010
+ "version": "3.12.2"
2011
+ }
2012
+ },
2013
+ "nbformat": 4,
2014
+ "nbformat_minor": 2
2015
+ }
notebooks/check_decontamination.ipynb ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "initial_id",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-15T07:49:59.747703Z",
10
+ "start_time": "2024-05-15T07:49:59.134058Z"
11
+ }
12
+ },
13
+ "source": [
14
+ "import pandas as pd\n",
15
+ "\n",
16
+ "df = pd.read_csv('/home/gui/hf_dev/datatrove/blogpost/data/decont_ngrams-per_dump.csv')"
17
+ ],
18
+ "execution_count": 1,
19
+ "outputs": []
20
+ },
21
+ {
22
+ "metadata": {
23
+ "ExecuteTime": {
24
+ "end_time": "2024-05-15T07:51:52.324884Z",
25
+ "start_time": "2024-05-15T07:51:52.283371Z"
26
+ }
27
+ },
28
+ "cell_type": "code",
29
+ "source": "df = df.groupby([\"ngram\", \"task\"], as_index=False)[\"count\"].sum().sort_values(\"count\", ascending=False)",
30
+ "id": "c691b2709c417bf4",
31
+ "execution_count": 8,
32
+ "outputs": []
33
+ },
34
+ {
35
+ "metadata": {
36
+ "ExecuteTime": {
37
+ "end_time": "2024-05-15T07:52:17.954219Z",
38
+ "start_time": "2024-05-15T07:52:17.938060Z"
39
+ }
40
+ },
41
+ "cell_type": "code",
42
+ "source": "df.to_csv('/home/gui/hf_dev/datatrove/blogpost/data/decont_ngrams-global.csv', index=False)",
43
+ "id": "9c0dfcd486f8e260",
44
+ "execution_count": 9,
45
+ "outputs": []
46
+ },
47
+ {
48
+ "metadata": {},
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "source": "",
52
+ "id": "d5fef0e4bc91a43e",
53
+ "outputs": []
54
+ }
55
+ ],
56
+ "metadata": {
57
+ "kernelspec": {
58
+ "display_name": "Python 3",
59
+ "language": "python",
60
+ "name": "python3"
61
+ },
62
+ "language_info": {
63
+ "codemirror_mode": {
64
+ "name": "ipython",
65
+ "version": 2
66
+ },
67
+ "file_extension": ".py",
68
+ "mimetype": "text/x-python",
69
+ "name": "python",
70
+ "nbconvert_exporter": "python",
71
+ "pygments_lexer": "ipython2",
72
+ "version": "2.7.6"
73
+ }
74
+ },
75
+ "nbformat": 4,
76
+ "nbformat_minor": 5
77
+ }
notebooks/check_top_60k_change.ipynb ADDED
@@ -0,0 +1,1424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "initial_id",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-14T15:00:26.015772Z",
10
+ "start_time": "2024-05-14T15:00:25.963139Z"
11
+ }
12
+ },
13
+ "source": [
14
+ "import pandas as pd\n",
15
+ "\n",
16
+ "df_43 = pd.read_csv('/home/gui/hf_dev/datatrove/blogpost/data/top_60k_urls_CC_MAIN_2021_43.csv').sort_values(by=\"Frequency\", ascending=False)\n",
17
+ "df_49 = pd.read_csv('/home/gui/hf_dev/datatrove/blogpost/data/top_60k_urls_CC_MAIN_2021_49.csv').sort_values(by=\"Frequency\", ascending=False)"
18
+ ],
19
+ "outputs": [],
20
+ "execution_count": 27
21
+ },
22
+ {
23
+ "metadata": {
24
+ "ExecuteTime": {
25
+ "end_time": "2024-05-14T15:00:27.625974Z",
26
+ "start_time": "2024-05-14T15:00:26.358729Z"
27
+ }
28
+ },
29
+ "cell_type": "code",
30
+ "source": "freqs_49 = {row[1][\"URL\"]: row[1][\"Frequency\"] for row in df_49.iterrows()}",
31
+ "id": "6a21a1ed442a6d79",
32
+ "outputs": [],
33
+ "execution_count": 28
34
+ },
35
+ {
36
+ "metadata": {
37
+ "ExecuteTime": {
38
+ "end_time": "2024-05-14T15:00:28.029676Z",
39
+ "start_time": "2024-05-14T15:00:27.626997Z"
40
+ }
41
+ },
42
+ "cell_type": "code",
43
+ "source": [
44
+ "df_43['in_49'] = df_43.apply(lambda row: freqs_49.get(row[\"URL\"], 0), axis=1)\n",
45
+ "df_43['change_to_49'] = df_43.apply(lambda row: freqs_49.get(row[\"URL\"], 0) - row[\"Frequency\"], axis=1)"
46
+ ],
47
+ "id": "bc7cdf0d04ff0d0",
48
+ "outputs": [],
49
+ "execution_count": 29
50
+ },
51
+ {
52
+ "metadata": {
53
+ "ExecuteTime": {
54
+ "end_time": "2024-05-14T15:00:28.035756Z",
55
+ "start_time": "2024-05-14T15:00:28.030629Z"
56
+ }
57
+ },
58
+ "cell_type": "code",
59
+ "source": "df_43",
60
+ "id": "990f471d499d7064",
61
+ "outputs": [
62
+ {
63
+ "data": {
64
+ "text/plain": [
65
+ " URL Frequency in_49 change_to_49\n",
66
+ "0 worldwidescience.org 0.001337 0.001317 -2.004393e-05\n",
67
+ "1 issuu.com 0.001006 0.000992 -1.395550e-05\n",
68
+ "2 en.wikipedia.org 0.000824 0.001125 3.004770e-04\n",
69
+ "3 caselaw.findlaw.com 0.000661 0.000228 -4.330254e-04\n",
70
+ "4 www.frontiersin.org 0.000611 0.000402 -2.088168e-04\n",
71
+ "... ... ... ... ...\n",
72
+ "59995 www.basketballghana.com 0.000001 0.000002 7.485694e-07\n",
73
+ "59996 meisendorf.com 0.000001 0.000000 -1.323341e-06\n",
74
+ "59997 www.anyrubbish.co.uk 0.000001 0.000000 -1.323290e-06\n",
75
+ "59998 qjshhxx.cn 0.000001 0.000000 -1.323239e-06\n",
76
+ "59999 www.al-enterprise.com 0.000001 0.000000 -1.323225e-06\n",
77
+ "\n",
78
+ "[60000 rows x 4 columns]"
79
+ ],
80
+ "text/html": [
81
+ "<div>\n",
82
+ "<style scoped>\n",
83
+ " .dataframe tbody tr th:only-of-type {\n",
84
+ " vertical-align: middle;\n",
85
+ " }\n",
86
+ "\n",
87
+ " .dataframe tbody tr th {\n",
88
+ " vertical-align: top;\n",
89
+ " }\n",
90
+ "\n",
91
+ " .dataframe thead th {\n",
92
+ " text-align: right;\n",
93
+ " }\n",
94
+ "</style>\n",
95
+ "<table border=\"1\" class=\"dataframe\">\n",
96
+ " <thead>\n",
97
+ " <tr style=\"text-align: right;\">\n",
98
+ " <th></th>\n",
99
+ " <th>URL</th>\n",
100
+ " <th>Frequency</th>\n",
101
+ " <th>in_49</th>\n",
102
+ " <th>change_to_49</th>\n",
103
+ " </tr>\n",
104
+ " </thead>\n",
105
+ " <tbody>\n",
106
+ " <tr>\n",
107
+ " <th>0</th>\n",
108
+ " <td>worldwidescience.org</td>\n",
109
+ " <td>0.001337</td>\n",
110
+ " <td>0.001317</td>\n",
111
+ " <td>-2.004393e-05</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>1</th>\n",
115
+ " <td>issuu.com</td>\n",
116
+ " <td>0.001006</td>\n",
117
+ " <td>0.000992</td>\n",
118
+ " <td>-1.395550e-05</td>\n",
119
+ " </tr>\n",
120
+ " <tr>\n",
121
+ " <th>2</th>\n",
122
+ " <td>en.wikipedia.org</td>\n",
123
+ " <td>0.000824</td>\n",
124
+ " <td>0.001125</td>\n",
125
+ " <td>3.004770e-04</td>\n",
126
+ " </tr>\n",
127
+ " <tr>\n",
128
+ " <th>3</th>\n",
129
+ " <td>caselaw.findlaw.com</td>\n",
130
+ " <td>0.000661</td>\n",
131
+ " <td>0.000228</td>\n",
132
+ " <td>-4.330254e-04</td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>4</th>\n",
136
+ " <td>www.frontiersin.org</td>\n",
137
+ " <td>0.000611</td>\n",
138
+ " <td>0.000402</td>\n",
139
+ " <td>-2.088168e-04</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>...</th>\n",
143
+ " <td>...</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>...</td>\n",
146
+ " <td>...</td>\n",
147
+ " </tr>\n",
148
+ " <tr>\n",
149
+ " <th>59995</th>\n",
150
+ " <td>www.basketballghana.com</td>\n",
151
+ " <td>0.000001</td>\n",
152
+ " <td>0.000002</td>\n",
153
+ " <td>7.485694e-07</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>59996</th>\n",
157
+ " <td>meisendorf.com</td>\n",
158
+ " <td>0.000001</td>\n",
159
+ " <td>0.000000</td>\n",
160
+ " <td>-1.323341e-06</td>\n",
161
+ " </tr>\n",
162
+ " <tr>\n",
163
+ " <th>59997</th>\n",
164
+ " <td>www.anyrubbish.co.uk</td>\n",
165
+ " <td>0.000001</td>\n",
166
+ " <td>0.000000</td>\n",
167
+ " <td>-1.323290e-06</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>59998</th>\n",
171
+ " <td>qjshhxx.cn</td>\n",
172
+ " <td>0.000001</td>\n",
173
+ " <td>0.000000</td>\n",
174
+ " <td>-1.323239e-06</td>\n",
175
+ " </tr>\n",
176
+ " <tr>\n",
177
+ " <th>59999</th>\n",
178
+ " <td>www.al-enterprise.com</td>\n",
179
+ " <td>0.000001</td>\n",
180
+ " <td>0.000000</td>\n",
181
+ " <td>-1.323225e-06</td>\n",
182
+ " </tr>\n",
183
+ " </tbody>\n",
184
+ "</table>\n",
185
+ "<p>60000 rows × 4 columns</p>\n",
186
+ "</div>"
187
+ ]
188
+ },
189
+ "execution_count": 30,
190
+ "metadata": {},
191
+ "output_type": "execute_result"
192
+ }
193
+ ],
194
+ "execution_count": 30
195
+ },
196
+ {
197
+ "metadata": {
198
+ "ExecuteTime": {
199
+ "end_time": "2024-05-14T14:59:00.727813Z",
200
+ "start_time": "2024-05-14T14:59:00.713582Z"
201
+ }
202
+ },
203
+ "cell_type": "code",
204
+ "source": "freqs_49",
205
+ "id": "6d9dec8ef32e61b2",
206
+ "outputs": [
207
+ {
208
+ "data": {
209
+ "text/plain": [
210
+ "{'worldwidescience.org': 0.0013371601446593,\n",
211
+ " 'issuu.com': 0.0010055409292792,\n",
212
+ " 'en.wikipedia.org': 0.0008242551989742,\n",
213
+ " 'caselaw.findlaw.com': 0.0006608764065299,\n",
214
+ " 'www.frontiersin.org': 0.0006109349846815,\n",
215
+ " 'chroniclingamerica.loc.gov': 0.0005375644622649,\n",
216
+ " 'www.nature.com': 0.0005266164399221,\n",
217
+ " 'lookformedical.com': 0.0004704824927555,\n",
218
+ " 'en.m.wikipedia.org': 0.0004443793906608,\n",
219
+ " 'ufdc.ufl.edu': 0.0004426934126914,\n",
220
+ " 'journals.plos.org': 0.0004371460070378,\n",
221
+ " 'casetext.com': 0.0003903933122818,\n",
222
+ " 'hansard.parliament.uk': 0.0003611472676514,\n",
223
+ " 'search.audioburst.com': 0.0003535316041847,\n",
224
+ " 'ivy.fm': 0.0003530053155805,\n",
225
+ " 'link.springer.com': 0.0003286474058102,\n",
226
+ " 'www.newageislam.com': 0.0003196897846979,\n",
227
+ " 'www.studylight.org': 0.0003159274076666,\n",
228
+ " 'panewsarchive.psu.edu': 0.0003121250457216,\n",
229
+ " 'www.theguardian.com': 0.0003038877330206,\n",
230
+ " 'bmcgenomics.biomedcentral.com': 0.0002553320996198,\n",
231
+ " 'law.justia.com': 0.0002463165002013,\n",
232
+ " 'www.hotfreebooks.com': 0.0002438439057772,\n",
233
+ " 'www.hindawi.com': 0.0002374853040569,\n",
234
+ " 'zims-en.kiwix.campusafrica.gos.orange.com': 0.0002318541153106,\n",
235
+ " 'discover.hubpages.com': 0.0002314644626707,\n",
236
+ " 'www.fanfiction.net': 0.00023022444825,\n",
237
+ " 'www.huffpost.com': 0.0002272851203057,\n",
238
+ " 'nebnewspapers.unl.edu': 0.0002254127055625,\n",
239
+ " 'www.angi.com': 0.0002231034435845,\n",
240
+ " 'www.theyworkforyou.com': 0.0002208272250724,\n",
241
+ " 'www.ola.org': 0.000218709862407,\n",
242
+ " 'www.wikihow.com': 0.0002180965139118,\n",
243
+ " 'acp.copernicus.org': 0.0002111536489058,\n",
244
+ " 'www.semanticscholar.org': 0.0001950019041289,\n",
245
+ " 'www.theatlantic.com': 0.0001913746442804,\n",
246
+ " 'docplayer.net': 0.0001907412725177,\n",
247
+ " 'irclogs.ubuntu.com': 0.0001898102449522,\n",
248
+ " 'www.forbes.com': 0.00018939073865,\n",
249
+ " 'publications.waset.org': 0.0001873821881616,\n",
250
+ " 'www.washingtonpost.com': 0.0001866054782221,\n",
251
+ " 'subsaga.com': 0.0001865089939112,\n",
252
+ " 'www.nybooks.com': 0.000184557446664,\n",
253
+ " 'newspapers.library.wales': 0.0001845348751784,\n",
254
+ " 'www.dailymail.co.uk': 0.0001837949675985,\n",
255
+ " 'deepai.org': 0.0001831206612305,\n",
256
+ " 'www.gastearsivi.com': 0.0001817473130172,\n",
257
+ " 'transparentpng.netlify.app': 0.0001812971084796,\n",
258
+ " 'api.parliament.uk': 0.0001797907418292,\n",
259
+ " 'pubmed.ncbi.nlm.nih.gov': 0.0001747089206415,\n",
260
+ " 'gateway.ipfs.io': 0.0001725078365045,\n",
261
+ " 'www.thefreelibrary.com': 0.0001708132388158,\n",
262
+ " 'alumnius.net': 0.0001683239812924,\n",
263
+ " 'www.yumpu.com': 0.0001673814233117,\n",
264
+ " 'www.inkitt.com': 0.0001636112276366,\n",
265
+ " 'm.fanfiction.net': 0.0001552928359265,\n",
266
+ " 'contracts.justia.com': 0.0001548855632462,\n",
267
+ " 'patents.google.com': 0.0001517315288932,\n",
268
+ " 'www.latimes.com': 0.0001510611082861,\n",
269
+ " 'www.arxiv-vanity.com': 0.0001493404557011,\n",
270
+ " 'bmcpublichealth.biomedcentral.com': 0.0001492791353507,\n",
271
+ " 'www.telegraph.co.uk': 0.0001486748451729,\n",
272
+ " 'archiveofourown.org': 0.0001479185499767,\n",
273
+ " 'www.economist.com': 0.0001453357732099,\n",
274
+ " 'studylib.net': 0.0001453062312784,\n",
275
+ " 'www.espn.com': 0.0001440956536707,\n",
276
+ " 'hubpages.com': 0.0001390828482346,\n",
277
+ " 'time.com': 0.000137760710869,\n",
278
+ " 'www.ipl.org': 0.0001376025183566,\n",
279
+ " 'robineko.com': 0.0001360150473236,\n",
280
+ " 'www.advocatekhoj.com': 0.0001359044227581,\n",
281
+ " 'www.tumgir.com': 0.000135691851343,\n",
282
+ " 'www.counterpunch.org': 0.0001340337950299,\n",
283
+ " 'tropedia.fandom.com': 0.000133800765374,\n",
284
+ " 'www.encyclopedia.com': 0.00013326334508,\n",
285
+ " 'www.nbcnews.com': 0.0001321288877867,\n",
286
+ " 'veranstaltungen.badminton.de': 0.0001317963108835,\n",
287
+ " 'slate.com': 0.0001308241783483,\n",
288
+ " 'patents.justia.com': 0.0001259783845068,\n",
289
+ " 'www.british-history.ac.uk': 0.0001259638274029,\n",
290
+ " 'bmchealthservres.biomedcentral.com': 0.0001243383360853,\n",
291
+ " 'dokumen.pub': 0.0001212929181714,\n",
292
+ " 'alchetron.com': 0.0001211202772984,\n",
293
+ " 'scocal.stanford.edu': 0.000121076402999,\n",
294
+ " 'www.avclub.com': 0.0001200605774033,\n",
295
+ " 'www.preceptaustin.org': 0.0001200404127696,\n",
296
+ " 'www.politico.com': 0.0001198441927229,\n",
297
+ " 'voracity.e-fic.com': 0.0001192375681888,\n",
298
+ " 'bmcbioinformatics.biomedcentral.com': 0.0001189960674294,\n",
299
+ " 'www.smh.com.au': 0.0001184023181027,\n",
300
+ " 'datatracker.ietf.org': 0.0001181400473718,\n",
301
+ " 'www.seattletimes.com': 0.0001175724181873,\n",
302
+ " 'm.scirp.org': 0.0001172250594441,\n",
303
+ " 'www.independent.co.uk': 0.0001172085630845,\n",
304
+ " 'www.theglobeandmail.com': 0.0001169944801374,\n",
305
+ " 'blogs.sap.com': 0.0001168635749453,\n",
306
+ " 'www.airtasker.com': 0.0001165835971794,\n",
307
+ " 'bmccancer.biomedcentral.com': 0.0001165253723884,\n",
308
+ " 'newrepublic.com': 0.0001156806326365,\n",
309
+ " 'en.wikisource.org': 0.0001146742133368,\n",
310
+ " 'www.newworldencyclopedia.org': 0.0001145322380761,\n",
311
+ " 'docs.justia.com': 0.0001144089739136,\n",
312
+ " 'www.fool.com': 0.0001144039245994,\n",
313
+ " 'www.vox.com': 0.0001139990805197,\n",
314
+ " 'moz.com': 0.0001138081019004,\n",
315
+ " 'www.fastcompany.com': 0.0001133714141558,\n",
316
+ " 'www.thedailybeast.com': 0.0001121551963952,\n",
317
+ " 'grantland.com': 0.0001121058124351,\n",
318
+ " 'apps.apple.com': 0.0001097454882252,\n",
319
+ " 'www.cdc.gov': 0.0001070063277096,\n",
320
+ " 'techcrunch.com': 0.0001061237416623,\n",
321
+ " 'www.bostonglobe.com': 0.0001050931342265,\n",
322
+ " 'www.creativelive.com': 0.0001047344988024,\n",
323
+ " 'www.si.com': 0.0001044359259311,\n",
324
+ " 'devrant.com': 0.0001040909704149,\n",
325
+ " 'www.opendemocracy.net': 0.0001039552116465,\n",
326
+ " 'gizmodo.com': 0.0001036007519653,\n",
327
+ " 'www.healthline.com': 0.0001035295830961,\n",
328
+ " 'thehockeywriters.com': 0.0001025606947362,\n",
329
+ " 'www.npr.org': 0.0001020513809621,\n",
330
+ " 'www.jpost.com': 0.0001018387189275,\n",
331
+ " 'www.newstatesman.com': 0.0001016472835864,\n",
332
+ " 'bleacherreport.com': 0.0001013723877576,\n",
333
+ " 'www.foxnews.com': 0.000101312256334,\n",
334
+ " 'www.apotelyt.com': 0.0001010690519295,\n",
335
+ " 'archive.org': 9.968097987253116e-05,\n",
336
+ " 'www.nzherald.co.nz': 9.956585333416228e-05,\n",
337
+ " 'truthout.org': 9.943359610027588e-05,\n",
338
+ " 'www.fictionpress.com': 9.892011745752971e-05,\n",
339
+ " 'www.politico.eu': 9.790526692680434e-05,\n",
340
+ " 'www.techradar.com': 9.74725918101852e-05,\n",
341
+ " 'www.prnewswire.com': 9.738541230153274e-05,\n",
342
+ " 'www.newyorker.com': 9.735472131572091e-05,\n",
343
+ " 'www.inquirer.com': 9.707689666725828e-05,\n",
344
+ " 'www.haaretz.com': 9.671520555620812e-05,\n",
345
+ " 'digital.bentley.umich.edu': 9.671287844945354e-05,\n",
346
+ " 'thepointsguy.com': 9.628801110222596e-05,\n",
347
+ " 'www.thoughtco.com': 9.607799153001185e-05,\n",
348
+ " 'www.openpr.com': 9.553215812886762e-05,\n",
349
+ " 'encyclopedia2.thefreedictionary.com': 9.500022574704412e-05,\n",
350
+ " 'www.usatoday.com': 9.488317082737702e-05,\n",
351
+ " 'www.democracynow.org': 9.415912889588526e-05,\n",
352
+ " 'www.cnn.com': 9.4023819599869e-05,\n",
353
+ " 'www.abc.net.au': 9.389910915143912e-05,\n",
354
+ " 'www.chicagotribune.com': 9.352145452302566e-05,\n",
355
+ " 'dictionnaire.sensagent.leparisien.fr': 9.26353234439554e-05,\n",
356
+ " 'bioresources.cnr.ncsu.edu': 9.263514582988844e-05,\n",
357
+ " 'contract4j.org': 9.26177287769977e-05,\n",
358
+ " 'www.motortrend.com': 9.25636108582342e-05,\n",
359
+ " 'www.canada.ca': 9.242836680820172e-05,\n",
360
+ " 'www.salon.com': 9.222098969833042e-05,\n",
361
+ " 'www.thestar.com': 9.19553624240566e-05,\n",
362
+ " 'www.reuters.com': 9.194116054825516e-05,\n",
363
+ " 'publications.parliament.uk': 9.19099077220275e-05,\n",
364
+ " 'tvtropes.org': 9.137703652299302e-05,\n",
365
+ " 'www.instructables.com': 9.10100169899347e-05,\n",
366
+ " 'www.stitcher.com': 9.093581418249948e-05,\n",
367
+ " 'edition.cnn.com': 9.04522725719749e-05,\n",
368
+ " 'www.inc.com': 9.042346647013876e-05,\n",
369
+ " 'mickopedia.org': 9.03179963373684e-05,\n",
370
+ " 'www.rollingstone.com': 8.991108613478614e-05,\n",
371
+ " 'oregonnews.uoregon.edu': 8.958794815059258e-05,\n",
372
+ " 'military.wikia.org': 8.950105499931337e-05,\n",
373
+ " 'supersearch.can.ucsd.edu': 8.913234632023067e-05,\n",
374
+ " 'www.rt.com': 8.907333857746101e-05,\n",
375
+ " 'dilbert.com': 8.872705639291213e-05,\n",
376
+ " 'www.instantcheckmate.com': 8.847558024756348e-05,\n",
377
+ " 'scienceblogs.com': 8.833675854275406e-05,\n",
378
+ " 'www.nap.edu': 8.760726132206345e-05,\n",
379
+ " 'venturebeat.com': 8.734926420311428e-05,\n",
380
+ " 'www.lastminute.com': 8.725486413892421e-05,\n",
381
+ " 'www.thesun.co.uk': 8.722505759867021e-05,\n",
382
+ " 'www.buzzfeednews.com': 8.717305654960247e-05,\n",
383
+ " 'ilgotha.org': 8.713850880119424e-05,\n",
384
+ " 'www.finder.com': 8.69887910170946e-05,\n",
385
+ " 'laagvlieger.x24hr.com': 8.674987109884553e-05,\n",
386
+ " 'www.cheatsheet.com': 8.580867603424703e-05,\n",
387
+ " '75-3-247-200.lightspeed.sntcca.sbcglobal.net': 8.560513031353399e-05,\n",
388
+ " 'themillions.com': 8.537252838138035e-05,\n",
389
+ " 'www.mic.com': 8.527057065740314e-05,\n",
390
+ " 'www.baltimoresun.com': 8.519745165825395e-05,\n",
391
+ " 'www.lrb.co.uk': 8.511623128278618e-05,\n",
392
+ " 'suptg.thisisnotatrueending.com': 8.504496091984392e-05,\n",
393
+ " 'www.eurasiareview.com': 8.477297578692904e-05,\n",
394
+ " 'www.news.com.au': 8.465024084189674e-05,\n",
395
+ " 'www.firstpost.com': 8.437763224735913e-05,\n",
396
+ " 'journals.lww.com': 8.429768416857495e-05,\n",
397
+ " 'insecure.archiveofourown.org': 8.37850319749566e-05,\n",
398
+ " 'www.bristolpost.co.uk': 8.359137464556217e-05,\n",
399
+ " 'www.vice.com': 8.343498364723322e-05,\n",
400
+ " 'kanithamizh.in': 8.289961860170783e-05,\n",
401
+ " '33.agilestudio.cn': 8.202731242164385e-05,\n",
402
+ " 'www.colombotelegraph.com': 8.186559662608357e-05,\n",
403
+ " 'www.icty.org': 8.181046739457158e-05,\n",
404
+ " 'www.pcmag.com': 8.167827903144583e-05,\n",
405
+ " 'www.globalsecurity.org': 8.133342138420935e-05,\n",
406
+ " 'transcripts.cnn.com': 8.113191641287768e-05,\n",
407
+ " 'www.indiewire.com': 8.110638710933774e-05,\n",
408
+ " 'www.denofgeek.com': 8.093316264719514e-05,\n",
409
+ " 'www.smithsonianmag.com': 8.085372203717366e-05,\n",
410
+ " 'www.bailii.org': 8.072180553231362e-05,\n",
411
+ " 'www.thisismoney.co.uk': 8.06264956491276e-05,\n",
412
+ " 'www.bartleby.com': 8.037844491792707e-05,\n",
413
+ " 'www.express.co.uk': 8.037329773476269e-05,\n",
414
+ " 'www.csmonitor.com': 8.036398930774429e-05,\n",
415
+ " 'www.irishexaminer.com': 8.036148096214588e-05,\n",
416
+ " 'yaledailynews.com': 8.031070146288547e-05,\n",
417
+ " 'www.texasmonthly.com': 8.021909610166704e-05,\n",
418
+ " 'fr.slideserve.com': 8.019415401198106e-05,\n",
419
+ " 'www.speakingtree.in': 8.013157586398822e-05,\n",
420
+ " 'en.m.wikiquote.org': 7.987411883679364e-05,\n",
421
+ " 'heavy.com': 7.941452975186778e-05,\n",
422
+ " 'www.mentalfloss.com': 7.925765303343739e-05,\n",
423
+ " 'www.romper.com': 7.902143719873806e-05,\n",
424
+ " 'nypost.com': 7.891734810595757e-05,\n",
425
+ " 'de.zxc.wiki': 7.869296354298256e-05,\n",
426
+ " 'www.tripsavvy.com': 7.866977222052784e-05,\n",
427
+ " 'www.inverse.com': 7.846257634950035e-05,\n",
428
+ " 'www.ukessays.com': 7.822928208496332e-05,\n",
429
+ " 'erhverv.studentersamfundet.aau.dk': 7.788662830206765e-05,\n",
430
+ " 'www.ajmc.com': 7.778079931638652e-05,\n",
431
+ " 'www.cnet.com': 7.776564049948973e-05,\n",
432
+ " 'www.aljazeera.com': 7.776164962014891e-05,\n",
433
+ " 'www.theverge.com': 7.771762670498596e-05,\n",
434
+ " 'www.kayak.com': 7.758318373064315e-05,\n",
435
+ " 'homeguides.sfgate.com': 7.750585999031743e-05,\n",
436
+ " 'medium.com': 7.744013553599594e-05,\n",
437
+ " 'philarchive.org': 7.742044574800376e-05,\n",
438
+ " 'wikimili.com': 7.71816454473916e-05,\n",
439
+ " 'www.hollywoodreporter.com': 7.709207721077766e-05,\n",
440
+ " 'www.boloji.com': 7.692111098462902e-05,\n",
441
+ " 'www.readkong.com': 7.682233943952609e-05,\n",
442
+ " 'parasitesandvectors.biomedcentral.com': 7.654496788817299e-05,\n",
443
+ " 'shakespir.com': 7.649789653565706e-05,\n",
444
+ " 'docoh.com': 7.647789501685351e-05,\n",
445
+ " 'www.thestreet.com': 7.587663152771657e-05,\n",
446
+ " 'www.quotemaster.org': 7.578018346459137e-05,\n",
447
+ " 'pizza.dominos.com': 7.554430473414153e-05,\n",
448
+ " 'www.crikey.com.au': 7.552816722748815e-05,\n",
449
+ " 'ew.com': 7.54755717150127e-05,\n",
450
+ " 'aspe.hhs.gov': 7.542470159633038e-05,\n",
451
+ " 'en.wikibooks.org': 7.538765637665437e-05,\n",
452
+ " 'www.deseret.com': 7.517427301149898e-05,\n",
453
+ " 'www.huffingtonpost.co.uk': 7.506976706937772e-05,\n",
454
+ " 'thehill.com': 7.506696149207544e-05,\n",
455
+ " 'www.businesswire.com': 7.447800774521049e-05,\n",
456
+ " 'finance.yahoo.com': 7.432127601785517e-05,\n",
457
+ " 'www.patheos.com': 7.431243156227694e-05,\n",
458
+ " 'gahistoricnewspapers.galileo.usg.edu': 7.409300569406873e-05,\n",
459
+ " 'www.thisdaylive.com': 7.401369920079166e-05,\n",
460
+ " 'www.hindustantimes.com': 7.400497073807348e-05,\n",
461
+ " 'americanliterature.com': 7.39991203481951e-05,\n",
462
+ " 'www.govinfo.gov': 7.398683597936124e-05,\n",
463
+ " 'www.vulture.com': 7.397641837061869e-05,\n",
464
+ " 'www.verywellhealth.com': 7.367888943505635e-05,\n",
465
+ " 'edwardbetts.com': 7.347130933767997e-05,\n",
466
+ " 'www.osapublishing.org': 7.340340639245581e-05,\n",
467
+ " 'jamanetwork.com': 7.339769374409872e-05,\n",
468
+ " 'worldcrunch.com': 7.332037362854988e-05,\n",
469
+ " 'journals.biologists.com': 7.314969738455134e-05,\n",
470
+ " 'www.knowpia.com': 7.278462073189873e-05,\n",
471
+ " 'www.japantimes.co.jp': 7.270068177377371e-05,\n",
472
+ " 'casestudylion.com': 7.267251725744468e-05,\n",
473
+ " 'listverse.com': 7.263129629480715e-05,\n",
474
+ " 'www.wired.com': 7.262107079923904e-05,\n",
475
+ " 'reason.com': 7.227048600453796e-05,\n",
476
+ " 'www.pcworld.com': 7.210347803473826e-05,\n",
477
+ " 'lrb.co.uk': 7.202621229084256e-05,\n",
478
+ " 'apnews.com': 7.177186894698473e-05,\n",
479
+ " 'vault.si.com': 7.175390092800879e-05,\n",
480
+ " 'www.mirror.co.uk': 7.167631257897111e-05,\n",
481
+ " 'www.washingtontimes.com': 7.157697919345235e-05,\n",
482
+ " 'abcnews.go.com': 7.154017320905017e-05,\n",
483
+ " 'www.michigandaily.com': 7.145109069253766e-05,\n",
484
+ " 'japantoday.com': 7.141615509300366e-05,\n",
485
+ " 'www.indystar.com': 7.137189294256688e-05,\n",
486
+ " 'www.alternet.org': 7.133872260937165e-05,\n",
487
+ " 'brooklynrail.org': 7.131313530940169e-05,\n",
488
+ " 'www.angelfire.com': 7.12078065429295e-05,\n",
489
+ " 'www.cracked.com': 7.115592873627557e-05,\n",
490
+ " 'www.nj.com': 7.110599381002733e-05,\n",
491
+ " 'washingtoncitypaper.com': 7.102516853523909e-05,\n",
492
+ " 'www.lifehack.org': 7.096077437403122e-05,\n",
493
+ " 'www.cbc.ca': 7.093102945498411e-05,\n",
494
+ " 'www.etsy.com': 7.086754148799531e-05,\n",
495
+ " 'www.wnd.com': 7.084338235011462e-05,\n",
496
+ " 'thenextweb.com': 7.078859022285212e-05,\n",
497
+ " 'www.paperdue.com': 7.078620874444437e-05,\n",
498
+ " 'www.intechopen.com': 7.062377161828572e-05,\n",
499
+ " 'core-cms.prod.aop.cambridge.org': 7.005774096038663e-05,\n",
500
+ " 'www.foxbusiness.com': 6.999398113513212e-05,\n",
501
+ " 'www.freep.com': 6.994770723352898e-05,\n",
502
+ " 'www.rxlist.com': 6.978336709951025e-05,\n",
503
+ " 'variety.com': 6.975902309800886e-05,\n",
504
+ " 'news.yahoo.com': 6.969056918669951e-05,\n",
505
+ " 'thoughtcatalog.com': 6.942696816269956e-05,\n",
506
+ " 'www.afr.com': 6.928289415619634e-05,\n",
507
+ " 'www.frontstretch.com': 6.924485212287934e-05,\n",
508
+ " 'www.yahoo.com': 6.923795054770682e-05,\n",
509
+ " 'counter-currents.com': 6.914461254314151e-05,\n",
510
+ " 'www.macworld.com': 6.895888622555247e-05,\n",
511
+ " 'www.actionnetwork.com': 6.890092604330007e-05,\n",
512
+ " 'www.thenation.com': 6.889150162342162e-05,\n",
513
+ " 'supreme.justia.com': 6.858906111519188e-05,\n",
514
+ " 'www.technologyreview.com': 6.842741419039226e-05,\n",
515
+ " 'www.bbc.com': 6.82069081386749e-05,\n",
516
+ " 'www.answers.com': 6.810150325188831e-05,\n",
517
+ " 'www.tomsguide.com': 6.802705033484864e-05,\n",
518
+ " 'qz.com': 6.796972086377271e-05,\n",
519
+ " 'locations.acima.com': 6.781505888401377e-05,\n",
520
+ " 'www.pajiba.com': 6.774461134542243e-05,\n",
521
+ " 'www.slideserve.com': 6.767139447729758e-05,\n",
522
+ " 'www.cinemablend.com': 6.707308880609173e-05,\n",
523
+ " 'www.scoop.co.nz': 6.700737160132399e-05,\n",
524
+ " 'www.desmoinesregister.com': 6.696830375615098e-05,\n",
525
+ " 'nationalpost.com': 6.695414175289518e-05,\n",
526
+ " 'www.edweek.org': 6.689663466775233e-05,\n",
527
+ " 'www.yukon-news.com': 6.681017648969827e-05,\n",
528
+ " 'bmcinfectdis.biomedcentral.com': 6.676929263131023e-05,\n",
529
+ " 'hbr.org': 6.664135975623728e-05,\n",
530
+ " 'www.zdnet.com': 6.661904562978665e-05,\n",
531
+ " 'www.lifesitenews.com': 6.656624712980612e-05,\n",
532
+ " 'www.lifenews.com': 6.648742635663048e-05,\n",
533
+ " 'projects.sare.org': 6.648619393249253e-05,\n",
534
+ " 'www.aanda.org': 6.623126337478073e-05,\n",
535
+ " 'www.thenationalnews.com': 6.621839179209292e-05,\n",
536
+ " 'www.digitaltrends.com': 6.600602698923979e-05,\n",
537
+ " 'dzone.com': 6.590109332344712e-05,\n",
538
+ " 'cc.bingj.com': 6.589474996391355e-05,\n",
539
+ " 'www.cnbc.com': 6.588816736910614e-05,\n",
540
+ " 'www.today.com': 6.586893792777725e-05,\n",
541
+ " 'www.tor.com': 6.58186260247339e-05,\n",
542
+ " 'www.scotsman.com': 6.567917723352487e-05,\n",
543
+ " 'incidecoder.com': 6.564429238086714e-05,\n",
544
+ " 'nymag.com': 6.561455833615067e-05,\n",
545
+ " 'www.pbs.org': 6.54957526492523e-05,\n",
546
+ " 'www.entrepreneur.com': 6.546032408006312e-05,\n",
547
+ " 'www.aol.com': 6.534515766914859e-05,\n",
548
+ " 'deadline.com': 6.521372688439e-05,\n",
549
+ " 'www.azcentral.com': 6.515080438259391e-05,\n",
550
+ " 'chicagoreader.com': 6.509821611967221e-05,\n",
551
+ " 'www.plymouthherald.co.uk': 6.485229313249809e-05,\n",
552
+ " 'www.inman.com': 6.484517044593611e-05,\n",
553
+ " 'www.rediff.com': 6.465703727650121e-05,\n",
554
+ " 'github.com': 6.455723629476541e-05,\n",
555
+ " 'www.theepochtimes.com': 6.435335346980286e-05,\n",
556
+ " 'indianexpress.com': 6.421343345759994e-05,\n",
557
+ " 'smallbusiness.chron.com': 6.420471949398925e-05,\n",
558
+ " 'todayinsci.com': 6.418823400875574e-05,\n",
559
+ " 'www.biblegateway.com': 6.416743866381627e-05,\n",
560
+ " 'www.computerweekly.com': 6.40963241662797e-05,\n",
561
+ " 'electricscotland.com': 6.40196528857917e-05,\n",
562
+ " 'www.eurekalert.org': 6.398032768146048e-05,\n",
563
+ " 'chestofbooks.com': 6.388934940664165e-05,\n",
564
+ " 'wwd.com': 6.380502622216775e-05,\n",
565
+ " 'www.vanityfair.com': 6.37191190101989e-05,\n",
566
+ " 'www.detroitnews.com': 6.369564133037095e-05,\n",
567
+ " 'www.popsci.com': 6.354896110929367e-05,\n",
568
+ " 'email.varsitytutors.com': 6.349104442436378e-05,\n",
569
+ " 'unispal.un.org': 6.334205522041628e-05,\n",
570
+ " 'www.stylist.co.uk': 6.33394961279416e-05,\n",
571
+ " 'www.polygon.com': 6.3172749142077e-05,\n",
572
+ " 'spectator.org': 6.305600595322126e-05,\n",
573
+ " 'www.trustedreviews.com': 6.28778300458656e-05,\n",
574
+ " 'matadornetwork.com': 6.286947493516568e-05,\n",
575
+ " 'www.bizpacreview.com': 6.274636663811513e-05,\n",
576
+ " 'cloudflare-ipfs.com': 6.264667439968561e-05,\n",
577
+ " 'moderndiplomacy.eu': 6.260917608290006e-05,\n",
578
+ " 'www.cyclingnews.com': 6.254338275781793e-05,\n",
579
+ " 'www.orlandosentinel.com': 6.235571718460006e-05,\n",
580
+ " 'torontosun.com': 6.229324415513663e-05,\n",
581
+ " 'hindi.hwnews.in': 6.223899936918246e-05,\n",
582
+ " 'www.nakedcapitalism.com': 6.218519318123032e-05,\n",
583
+ " 'www.golfdigest.com': 6.213693290189896e-05,\n",
584
+ " 'www.indiatoday.in': 6.208597578857161e-05,\n",
585
+ " 'admin.rushlimbaugh.com': 6.205893857785112e-05,\n",
586
+ " 'slideplayer.com': 6.194997416017198e-05,\n",
587
+ " 'koreajoongangdaily.joins.com': 6.189866544348762e-05,\n",
588
+ " 'www.discovermagazine.com': 6.185195294388246e-05,\n",
589
+ " 'www.ncronline.org': 6.178336491582866e-05,\n",
590
+ " 'scroll.in': 6.174224182216679e-05,\n",
591
+ " 'www.elephantjournal.com': 6.153964579299534e-05,\n",
592
+ " 'www.eadt.co.uk': 6.147534950076313e-05,\n",
593
+ " 'www.mlb.com': 6.127388802675397e-05,\n",
594
+ " 'signalscv.com': 6.126671096853886e-05,\n",
595
+ " 'www.genomeweb.com': 6.11374333012448e-05,\n",
596
+ " 'sabr.org': 6.11251996792872e-05,\n",
597
+ " 'www.liverpoolecho.co.uk': 6.105908374906308e-05,\n",
598
+ " 'www.tomshardware.com': 6.105648115926588e-05,\n",
599
+ " 'blogs.lse.ac.uk': 6.087712357464855e-05,\n",
600
+ " 'lawprofessors.typepad.com': 6.073721806155311e-05,\n",
601
+ " 'www.walesonline.co.uk': 6.073342654494048e-05,\n",
602
+ " 'www.rferl.org': 6.072680770236432e-05,\n",
603
+ " 'www.treatwell.co.uk': 6.070949214322612e-05,\n",
604
+ " 'www.straitstimes.com': 6.070556650986906e-05,\n",
605
+ " 'wccftech.com': 6.061624113330584e-05,\n",
606
+ " 'genomebiology.biomedcentral.com': 6.0551513492625336e-05,\n",
607
+ " 'www.refinery29.com': 6.031577612847368e-05,\n",
608
+ " 'profiles.stanford.edu': 6.031038246048171e-05,\n",
609
+ " 'www.rollcall.com': 6.0308276465116567e-05,\n",
610
+ " 'wol.jw.org': 6.020821812422255e-05,\n",
611
+ " 'translational-medicine.biomedcentral.com': 6.0120034552375366e-05,\n",
612
+ " 'www.sun-sentinel.com': 6.006348803310472e-05,\n",
613
+ " 'www.varsitytutors.com': 6.002798696837802e-05,\n",
614
+ " 'salesandmarketingnetwork.com': 5.997694286040564e-05,\n",
615
+ " 'www.nottinghampost.com': 5.986280951090086e-05,\n",
616
+ " 'www.dailypress.com': 5.9842826115981694e-05,\n",
617
+ " 'thestuffedbakedpotatofactory.com': 5.982481459968324e-05,\n",
618
+ " 'www.catholicculture.org': 5.978192986445947e-05,\n",
619
+ " 'www.wsws.org': 5.972766695462091e-05,\n",
620
+ " 'www.irishcentral.com': 5.962983060195208e-05,\n",
621
+ " 'db0nus869y26v.cloudfront.net': 5.961873153515678e-05,\n",
622
+ " 'www.brookings.edu': 5.9409991509179976e-05,\n",
623
+ " 'www.tripadvisor.com': 5.919981969633702e-05,\n",
624
+ " 'docs.microsoft.com': 5.914151153550449e-05,\n",
625
+ " 'www.courant.com': 5.906802280911392e-05,\n",
626
+ " 'www.gearpatrol.com': 5.8947230744487336e-05,\n",
627
+ " 'www.thefreedictionary.com': 5.8906499126728096e-05,\n",
628
+ " 'www.dailyrecord.co.uk': 5.889077121986173e-05,\n",
629
+ " 'www.hulldailymail.co.uk': 5.880339234848109e-05,\n",
630
+ " 'www.mycentraljersey.com': 5.868136785971656e-05,\n",
631
+ " 'www.stokesentinel.co.uk': 5.8600274351439456e-05,\n",
632
+ " 'bookriot.com': 5.853559020808147e-05,\n",
633
+ " 'wn.rsarchive.org': 5.845390223639666e-05,\n",
634
+ " 'chicago.suntimes.com': 5.844645694469269e-05,\n",
635
+ " 'info.mzalendo.com': 5.843388621848561e-05,\n",
636
+ " 'www.manchestereveningnews.co.uk': 5.820470970048009e-05,\n",
637
+ " 'malariajournal.biomedcentral.com': 5.816658792207181e-05,\n",
638
+ " 'quizlet.com': 5.811156018431234e-05,\n",
639
+ " 'www.cyclingweekly.com': 5.804181947721189e-05,\n",
640
+ " 'www.tw.kayak.com': 5.791512627583065e-05,\n",
641
+ " 'www.opentable.com': 5.791108464961355e-05,\n",
642
+ " 'www.charityjob.co.uk': 5.780437121837461e-05,\n",
643
+ " 'thecyberwire.com': 5.769611000740947e-05,\n",
644
+ " 'bmcresnotes.biomedcentral.com': 5.769451873036076e-05,\n",
645
+ " 'dev.to': 5.768846172820043e-05,\n",
646
+ " 'newsblaze.com': 5.767688781563433e-05,\n",
647
+ " 'mmajunkie.usatoday.com': 5.767364001555314e-05,\n",
648
+ " 'villains.fandom.com': 5.756588627335069e-05,\n",
649
+ " 'ar15partspro.com': 5.756451973246832e-05,\n",
650
+ " 'www.informationweek.com': 5.746075324482982e-05,\n",
651
+ " 'www.phillymag.com': 5.737931538274948e-05,\n",
652
+ " 'www.kayak.co.uk': 5.732462837401639e-05,\n",
653
+ " 'www.mdpi.com': 5.7296170250760384e-05,\n",
654
+ " 'www.villagevoice.com': 5.728727504830589e-05,\n",
655
+ " 'www.smore.com': 5.721818680104317e-05,\n",
656
+ " 'www.outsideonline.com': 5.721286562858873e-05,\n",
657
+ " 'bgr.com': 5.711787472576781e-05,\n",
658
+ " 'www.livemint.com': 5.711578685428705e-05,\n",
659
+ " 'dir.indiamart.com': 5.711059254902328e-05,\n",
660
+ " 'timelines.ws': 5.705113170914408e-05,\n",
661
+ " 'www.medicalnewstoday.com': 5.698305477462985e-05,\n",
662
+ " 'www.gty.org': 5.697810695419367e-05,\n",
663
+ " 'anchor.fm': 5.696123724261127e-05,\n",
664
+ " 'investorplace.com': 5.6919421816566006e-05,\n",
665
+ " 'www.medpagetoday.com': 5.681609030215267e-05,\n",
666
+ " 'www.barnesandnoble.com': 5.678419951519478e-05,\n",
667
+ " 'www.foxsports.com': 5.676209562580296e-05,\n",
668
+ " 'www.wrswebsolutions.ca': 5.674841571787172e-05,\n",
669
+ " 'www.saltwire.com': 5.671346199445334e-05,\n",
670
+ " 'www.livescience.com': 5.670516125540656e-05,\n",
671
+ " 'www.myantispyware.com': 5.6697172247171146e-05,\n",
672
+ " 'www.sunnewsonline.com': 5.663415912595314e-05,\n",
673
+ " 'yourstory.com': 5.658385447246354e-05,\n",
674
+ " 'www.edp24.co.uk': 5.654794743272668e-05,\n",
675
+ " 'sports.yahoo.com': 5.648601811979471e-05,\n",
676
+ " 'www.bostonmagazine.com': 5.643611944131524e-05,\n",
677
+ " 'everything.explained.today': 5.626292397738766e-05,\n",
678
+ " 'wattsupwiththat.com': 5.6136426513957744e-05,\n",
679
+ " 'www.findlaw.com': 5.608533165910909e-05,\n",
680
+ " 'www.indiatimes.com': 5.608401586510299e-05,\n",
681
+ " 'www.businessinsider.com.au': 5.606848007141107e-05,\n",
682
+ " 'www.booking.com': 5.598988403440176e-05,\n",
683
+ " 'metro.co.uk': 5.5747183473870686e-05,\n",
684
+ " 'www.chanrobles.com': 5.571989252876885e-05,\n",
685
+ " 'jezebel.com': 5.567175549185128e-05,\n",
686
+ " 'fliphtml5.com': 5.56577602283318e-05,\n",
687
+ " 'familypedia.wikia.org': 5.564669015975151e-05,\n",
688
+ " 'thefederalist.com': 5.5615009609852446e-05,\n",
689
+ " 'www.breitbart.com': 5.558815726275264e-05,\n",
690
+ " 'www.law.cornell.edu': 5.553168686379641e-05,\n",
691
+ " 'arthritis-research.biomedcentral.com': 5.548828378547932e-05,\n",
692
+ " 'www.northjersey.com': 5.548443427243666e-05,\n",
693
+ " 'www.mailtribune.com': 5.546396515741607e-05,\n",
694
+ " 'freerepublic.com': 5.539630144746575e-05,\n",
695
+ " 'philpapers.org': 5.535732784649152e-05,\n",
696
+ " 'www.investopedia.com': 5.5335742300193024e-05,\n",
697
+ " 'groups.google.com': 5.529671795234253e-05,\n",
698
+ " 'www.poughkeepsiejournal.com': 5.527307353277827e-05,\n",
699
+ " 'www.internetnews.com': 5.525952411681458e-05,\n",
700
+ " 'dailyuknews.com': 5.517654209978492e-05,\n",
701
+ " 'www.aniapalka.pl': 5.5066240139438416e-05,\n",
702
+ " 'www.expedia.com': 5.505563766707517e-05,\n",
703
+ " 'www.greenbaypressgazette.com': 5.502396799150673e-05,\n",
704
+ " 'inews.co.uk': 5.4974982756800105e-05,\n",
705
+ " 'www.birminghammail.co.uk': 5.4962364908493626e-05,\n",
706
+ " 'doczz.net': 5.495750770747935e-05,\n",
707
+ " 'www.bookstrand.com': 5.491478971199189e-05,\n",
708
+ " 'redstate.com': 5.4876149590490285e-05,\n",
709
+ " 'literatureessaysamples.com': 5.486643881323862e-05,\n",
710
+ " 'www.runnersworld.com': 5.48382416739177e-05,\n",
711
+ " 'whyy.org': 5.4829654577497694e-05,\n",
712
+ " 'www.cfr.org': 5.481779793233525e-05,\n",
713
+ " 'www.tcrecord.org': 5.470973245932143e-05,\n",
714
+ " 'www.modernghana.com': 5.461470893350862e-05,\n",
715
+ " 'vdare.com': 5.460818433513124e-05,\n",
716
+ " 'allthetropes.org': 5.445298226361772e-05,\n",
717
+ " 'www.democratandchronicle.com': 5.441863387793769e-05,\n",
718
+ " 'www.assignmentpoint.com': 5.432789846316957e-05,\n",
719
+ " 'townhall.com': 5.429902349057279e-05,\n",
720
+ " 'thediplomat.com': 5.428771781149553e-05,\n",
721
+ " 'www.newspapers.com': 5.424167589561248e-05,\n",
722
+ " 'www.lifewire.com': 5.418477777298484e-05,\n",
723
+ " 'apk-dl.com': 5.4162264283806e-05,\n",
724
+ " 'twit.tv': 5.412131517943417e-05,\n",
725
+ " 'ng.opera.news': 5.40462967972018e-05,\n",
726
+ " 'www.theage.com.au': 5.401626552078146e-05,\n",
727
+ " 'www.sfexaminer.com': 5.390221916592171e-05,\n",
728
+ " 'www.informit.com': 5.381701516066687e-05,\n",
729
+ " 'munafa.co.com': 5.380871079684321e-05,\n",
730
+ " 'paperzz.com': 5.357548902784372e-05,\n",
731
+ " 'www.psychologytoday.com': 5.35142085499726e-05,\n",
732
+ " 'www.sandiegouniontribune.com': 5.347108820425187e-05,\n",
733
+ " 'www.mybookie.ag': 5.343941127912968e-05,\n",
734
+ " 'www.bbc.co.uk': 5.3411145269048114e-05,\n",
735
+ " 'www.leicestermercury.co.uk': 5.33733497205587e-05,\n",
736
+ " 'www.simonandschuster.com': 5.336544770696831e-05,\n",
737
+ " 'laitman.com': 5.328105202695689e-05,\n",
738
+ " 'www.startribune.com': 5.323949758484672e-05,\n",
739
+ " 'www.thedrive.com': 5.314180622325294e-05,\n",
740
+ " 'www.tennessean.com': 5.312762609611277e-05,\n",
741
+ " 'www.premiumtimesng.com': 5.306046985492513e-05,\n",
742
+ " 'www.gsmarena.com': 5.2984045059264736e-05,\n",
743
+ " 'observer.com': 5.296911822808804e-05,\n",
744
+ " 'www.jsonline.com': 5.2942515990592704e-05,\n",
745
+ " 'mronline.org': 5.2934798840623017e-05,\n",
746
+ " 'dailyreckoning.com': 5.292931092843226e-05,\n",
747
+ " 'theconversation.com': 5.291890056924346e-05,\n",
748
+ " 'arcus-www.amazon.com': 5.29151742986146e-05,\n",
749
+ " 'www.scientificamerican.com': 5.284368282428288e-05,\n",
750
+ " 'www.cornwalllive.com': 5.2795886516391694e-05,\n",
751
+ " 'www.cambridge-news.co.uk': 5.2745951590143466e-05,\n",
752
+ " 'www.wetwebmedia.com': 5.268803490521358e-05,\n",
753
+ " 'pitchfork.com': 5.266239323359047e-05,\n",
754
+ " 'www.apartmentlist.com': 5.241080834493553e-05,\n",
755
+ " 'www.washingtonian.com': 5.231334896906184e-05,\n",
756
+ " 'forward.com': 5.2309702443524256e-05,\n",
757
+ " 'content.iospress.com': 5.225874895497378e-05,\n",
758
+ " 'www.thecanary.co': 5.2257574527265855e-05,\n",
759
+ " 'www.delawareonline.com': 5.202899247266805e-05,\n",
760
+ " 'www.crosswalk.com': 5.192594369085107e-05,\n",
761
+ " 'warwick.ac.uk': 5.185486181630638e-05,\n",
762
+ " 'newatlas.com': 5.182483053988604e-05,\n",
763
+ " 'www.europeanfinancialreview.com': 5.182141962484542e-05,\n",
764
+ " 'conwebwatch.tripod.com': 5.178665438982461e-05,\n",
765
+ " 'www.devonlive.com': 5.168742612283527e-05,\n",
766
+ " 'www.valuewalk.com': 5.1608906206140365e-05,\n",
767
+ " 'www.softpanorama.org': 5.160061271664734e-05,\n",
768
+ " 'www.capitalgazette.com': 5.1599318671302486e-05,\n",
769
+ " 'www.esquire.com': 5.158403661199192e-05,\n",
770
+ " 'www.essexlive.news': 5.146875058344046e-05,\n",
771
+ " 'www.space.com': 5.145492205965729e-05,\n",
772
+ " 'pjmedia.com': 5.136386766452406e-05,\n",
773
+ " 'wwwnc-origin.cdc.gov': 5.127543760784928e-05,\n",
774
+ " 'www.dovepress.com': 5.126935160747394e-05,\n",
775
+ " 'www.al-islam.org': 5.120930717851765e-05,\n",
776
+ " 'www.derbytelegraph.co.uk': 5.118368000600204e-05,\n",
777
+ " 'www.motorbiscuit.com': 5.115163697841535e-05,\n",
778
+ " 'www.mercurynews.com': 5.1100908226031215e-05,\n",
779
+ " 'www.texasobserver.org': 5.097951807322003e-05,\n",
780
+ " 'hess.copernicus.org': 5.095490946300668e-05,\n",
781
+ " 'www.distractify.com': 5.082006413843059e-05,\n",
782
+ " 'www.2-spyware.com': 5.0796260228683775e-05,\n",
783
+ " 'reliefweb.int': 5.074167833848011e-05,\n",
784
+ " 'www.dailypost.co.uk': 5.074157321995069e-05,\n",
785
+ " 'dailycaller.com': 5.073757509105611e-05,\n",
786
+ " 'www.ipswichstar.co.uk': 5.0712806990660194e-05,\n",
787
+ " 'www.somersetlive.co.uk': 5.070674998849986e-05,\n",
788
+ " 'www.newsreview.com': 5.05868423694311e-05,\n",
789
+ " '0-bmcpublichealth-biomedcentral-com.brum.beds.ac.uk': 5.052361176160052e-05,\n",
790
+ " 'www.chroniclelive.co.uk': 5.049609245555548e-05,\n",
791
+ " 'www.bikeradar.com': 5.030546906440653e-05,\n",
792
+ " 'www.gamesradar.com': 5.025984762264113e-05,\n",
793
+ " 'www.scielo.br': 5.0237004278766544e-05,\n",
794
+ " 'en.m.wikisource.org': 5.021848166892853e-05,\n",
795
+ " 'mail.python.org': 5.018190404546956e-05,\n",
796
+ " 'www.standardmedia.co.ke': 5.01001617021316e-05,\n",
797
+ " 'bankrupt.com': 5.0096000458277584e-05,\n",
798
+ " 'nationalinterest.org': 5.007328760637054e-05,\n",
799
+ " 'betches.com': 4.9974026716389314e-05,\n",
800
+ " 'ke.opera.news': 4.984810921725961e-05,\n",
801
+ " 'hpathy.com': 4.976094420771467e-05,\n",
802
+ " 'www.knoxnews.com': 4.975990389675117e-05,\n",
803
+ " 'economictimes.indiatimes.com': 4.974863809021956e-05,\n",
804
+ " 'steelersdepot.com': 4.974135228869815e-05,\n",
805
+ " 'www.thequint.com': 4.970924763990456e-05,\n",
806
+ " 'traveltriangle.com': 4.9628802966688325e-05,\n",
807
+ " 'www.publishersweekly.com': 4.960210286021733e-05,\n",
808
+ " 'www.monstersandcritics.com': 4.9594179097965685e-05,\n",
809
+ " 'elispot.biz': 4.957056730139332e-05,\n",
810
+ " 'www.educba.com': 4.955667715640325e-05,\n",
811
+ " 'www.embedded.com': 4.954561071259984e-05,\n",
812
+ " 'hyperallergic.com': 4.949491095843071e-05,\n",
813
+ " 'www.pdf-archive.com': 4.949371478206152e-05,\n",
814
+ " 'nautilus.org': 4.947787813188887e-05,\n",
815
+ " 'bmccomplementmedtherapies.biomedcentral.com': 4.944925689367342e-05,\n",
816
+ " 'traveltips.usatoday.com': 4.9349629901227686e-05,\n",
817
+ " 'bg.copernicus.org': 4.9268152166601696e-05,\n",
818
+ " 'www.theregister.com': 4.925028201660143e-05,\n",
819
+ " 'www.rockpapershotgun.com': 4.924633825935999e-05,\n",
820
+ " 'www.autostraddle.com': 4.918608721812175e-05,\n",
821
+ " 'www.mylondon.news': 4.917486853368953e-05,\n",
822
+ " 'www.thecut.com': 4.917366148298971e-05,\n",
823
+ " 'www.sellswatches.com': 4.907603174260283e-05,\n",
824
+ " 'www.siliconrepublic.com': 4.906262731771419e-05,\n",
825
+ " 'www.pearltrees.com': 4.904416632908308e-05,\n",
826
+ " 'www.artofmanliness.com': 4.898196515788537e-05,\n",
827
+ " 'top.hatnote.com': 4.89120142137261e-05,\n",
828
+ " 'www.pastemagazine.com': 4.886283686584502e-05,\n",
829
+ " 'www.hamhigh.co.uk': 4.883906920386697e-05,\n",
830
+ " 'movieweb.com': 4.875867890230389e-05,\n",
831
+ " 'www.opednews.com': 4.8749062369251e-05,\n",
832
+ " 'www.thetruthaboutguns.com': 4.873130096255702e-05,\n",
833
+ " 'www.newsday.com': 4.872338082508225e-05,\n",
834
+ " 'theinfolist.com': 4.87136519239462e-05,\n",
835
+ " 'www.menshealth.com': 4.870897958655262e-05,\n",
836
+ " 'www.lancs.live': 4.8657065532129926e-05,\n",
837
+ " 'tribune.com.pk': 4.86490004035801e-05,\n",
838
+ " 'www.soapcentral.com': 4.85898766679504e-05,\n",
839
+ " 'www.aafp.org': 4.856965041298052e-05,\n",
840
+ " 'www.thetoptens.com': 4.855074357679362e-05,\n",
841
+ " 'www.yorkdispatch.com': 4.85267113061036e-05,\n",
842
+ " 'www.gloucestershirelive.co.uk': 4.8502167941874014e-05,\n",
843
+ " 'www.the-sun.com': 4.844362417054453e-05,\n",
844
+ " 'www.routledge.com': 4.8429161310808e-05,\n",
845
+ " 'english.pravda.ru': 4.841194362064546e-05,\n",
846
+ " 'online.flippingbook.com': 4.835702825096917e-05,\n",
847
+ " 'www.hurriyetdailynews.com': 4.828615661348331e-05,\n",
848
+ " 'newyork.cbslocal.com': 4.8281741635247945e-05,\n",
849
+ " 'www.gov.scot': 4.827646033533914e-05,\n",
850
+ " 'bmcmicrobiol.biomedcentral.com': 4.8238701034618486e-05,\n",
851
+ " 'starwars.fandom.com': 4.819857837937447e-05,\n",
852
+ " 'www.nasdaq.com': 4.818602215227488e-05,\n",
853
+ " 'www.buzzsprout.com': 4.817318319257895e-05,\n",
854
+ " 'www.sensesofcinema.com': 4.812550650232467e-05,\n",
855
+ " 'infogalactic.com': 4.811656417777078e-05,\n",
856
+ " 'www.desertsun.com': 4.807424853251659e-05,\n",
857
+ " 'www.rushlimbaugh.com': 4.806293197910871e-05,\n",
858
+ " 'www.app.com': 4.7993905353052895e-05,\n",
859
+ " 'www.tallahassee.com': 4.795652302912738e-05,\n",
860
+ " 'enhancedodds.co.uk': 4.793052612937039e-05,\n",
861
+ " 'legislature.idaho.gov': 4.789320905142864e-05,\n",
862
+ " 'www.examinerlive.co.uk': 4.7862438320525536e-05,\n",
863
+ " 'dailyfeed.dailynewsegypt.com': 4.786186560577908e-05,\n",
864
+ " 'www.wfae.org': 4.77679258882523e-05,\n",
865
+ " 'timesofindia.indiatimes.com': 4.771500414585798e-05,\n",
866
+ " 'www.msnbc.com': 4.7688260542064473e-05,\n",
867
+ " 'guardian.ng': 4.76175882673068e-05,\n",
868
+ " 'www.thewrap.com': 4.7593280513574185e-05,\n",
869
+ " 'www.hercampus.com': 4.75635319697502e-05,\n",
870
+ " 'uproxx.com': 4.745981260421109e-05,\n",
871
+ " 'www.sunsigns.org': 4.745620957599603e-05,\n",
872
+ " 'www.enr.com': 4.741904473868309e-05,\n",
873
+ " 'www.pcgamer.com': 4.731802583191685e-05,\n",
874
+ " 'www.drugs.com': 4.728373181788996e-05,\n",
875
+ " 'www.outlookindia.com': 4.726324457898498e-05,\n",
876
+ " '0-bmcplantbiol-biomedcentral-com.brum.beds.ac.uk': 4.725807564715934e-05,\n",
877
+ " 'www2.deloitte.com': 4.723304293805146e-05,\n",
878
+ " 'bible.knowing-jesus.com': 4.7134862231579375e-05,\n",
879
+ " 'www.rawstory.com': 4.704613131842386e-05,\n",
880
+ " 'www.niche.com': 4.702445515270346e-05,\n",
881
+ " 'www.courier-journal.com': 4.690736036049072e-05,\n",
882
+ " 'www.phoenixnewtimes.com': 4.6901285234446e-05,\n",
883
+ " 'news.thomasnet.com': 4.689956709020663e-05,\n",
884
+ " 'www.pulselive.co.ke': 4.689523548183942e-05,\n",
885
+ " 'legalinsurrection.com': 4.687433501837054e-05,\n",
886
+ " 'www.enotes.com': 4.6863946407843e-05,\n",
887
+ " 'www.lansingstatejournal.com': 4.684520631139241e-05,\n",
888
+ " 'www.prweb.com': 4.680034244799416e-05,\n",
889
+ " 'www.kitces.com': 4.665628656537534e-05,\n",
890
+ " 'www.churchofjesuschrist.org': 4.647538482580869e-05,\n",
891
+ " 'www.gazettelive.co.uk': 4.647152081365853e-05,\n",
892
+ " 'webstatsdomain.org': 4.644640473468249e-05,\n",
893
+ " 'bmcpregnancychildbirth.biomedcentral.com': 4.641730865069624e-05,\n",
894
+ " 'docs.google.com': 4.640923989736955e-05,\n",
895
+ " 'www.popularmechanics.com': 4.636077300575624e-05,\n",
896
+ " 'www.flavorwire.com': 4.633587803816965e-05,\n",
897
+ " 'dailymed.nlm.nih.gov': 4.632968329448802e-05,\n",
898
+ " 'www.bmj.com': 4.631275558647559e-05,\n",
899
+ " 'ca10.washburnlaw.edu': 4.63113600473782e-05,\n",
900
+ " 'interreviewed.com': 4.623577619995312e-05,\n",
901
+ " 'www.t3.com': 4.622088924132206e-05,\n",
902
+ " 'www.yorkshirepost.co.uk': 4.618057809768047e-05,\n",
903
+ " 'bestfamilypets.com': 4.617664521476966e-05,\n",
904
+ " 'www.dawn.com': 4.617599275493192e-05,\n",
905
+ " 'www.talkers.com': 4.614886130001265e-05,\n",
906
+ " 'lawandcrime.com': 4.613004870802453e-05,\n",
907
+ " 'www.mlive.com': 4.607975492886555e-05,\n",
908
+ " 'moam.info': 4.605280471279009e-05,\n",
909
+ " 'www.beliefnet.com': 4.604886095554865e-05,\n",
910
+ " 'www.yerepouni-news.com': 4.602874706866193e-05,\n",
911
+ " 'www.tbsnews.net': 4.599745074511176e-05,\n",
912
+ " 'www.getsurrey.co.uk': 4.59839267025862e-05,\n",
913
+ " 'www.rte.ie': 4.5900487963670126e-05,\n",
914
+ " 'soccer.nbcsports.com': 4.5871112771864406e-05,\n",
915
+ " 'www.usforacle.com': 4.586845218563718e-05,\n",
916
+ " 'www.gizmodo.com.au': 4.586498689894342e-05,\n",
917
+ " 'www.phnompenhpost.com': 4.57980118965996e-05,\n",
918
+ " 'www.bodybuilding.com': 4.57621882267309e-05,\n",
919
+ " 'lithub.com': 4.574725052122357e-05,\n",
920
+ " 'www.pharmtech.com': 4.567671236321033e-05,\n",
921
+ " 'www.rlpmax.ca': 4.5658856712317565e-05,\n",
922
+ " 'www.dezeen.com': 4.56107957957144e-05,\n",
923
+ " 'foreignpolicy.com': 4.560099439904082e-05,\n",
924
+ " 'sfbayview.com': 4.558094938291475e-05,\n",
925
+ " 'www.westword.com': 4.5521615410226224e-05,\n",
926
+ " 'iasbaba.com': 4.552126380686922e-05,\n",
927
+ " 'tgdaily.com': 4.551642472973933e-05,\n",
928
+ " 'www.history.navy.mil': 4.549480293567206e-05,\n",
929
+ " 'www.vcstar.com': 4.548789411094579e-05,\n",
930
+ " 'za.opera.news': 4.538702744480835e-05,\n",
931
+ " 'www.thescottishsun.co.uk': 4.538542529342902e-05,\n",
932
+ " 'www.huntspost.co.uk': 4.5374753950305125e-05,\n",
933
+ " 'www.houseofnames.com': 4.531766371450304e-05,\n",
934
+ " 'www.legal500.com': 4.528724458695694e-05,\n",
935
+ " 'www.iranicaonline.org': 4.524300056040454e-05,\n",
936
+ " 'www.lexico.com': 4.523562051468435e-05,\n",
937
+ " 'biotechnologyforbiofuels.biomedcentral.com': 4.5124981450088345e-05,\n",
938
+ " 'www.wjgnet.com': 4.511053308945932e-05,\n",
939
+ " 'www.thenewsstar.com': 4.498095094090765e-05,\n",
940
+ " 'www.mediapost.com': 4.4971530145806087e-05,\n",
941
+ " 'www.nhpr.org': 4.492473790110963e-05,\n",
942
+ " 'historicengland.org.uk': 4.490641102922295e-05,\n",
943
+ " 'ausbcomp.com': 4.484445634285284e-05,\n",
944
+ " 'byjus.com': 4.483729378374522e-05,\n",
945
+ " 'www.electricscotland.com': 4.483225171911026e-05,\n",
946
+ " 'www.thesun.ie': 4.481550524994165e-05,\n",
947
+ " 'www.dallasnews.com': 4.481432357267996e-05,\n",
948
+ " 'c-s-s-a.org': 4.4780953876756535e-05,\n",
949
+ " 'caribbean.winne.com': 4.4775538460103305e-05,\n",
950
+ " 'cherryblossomtreeimages.netlify.app': 4.47707936271722e-05,\n",
951
+ " 'www.wisdomlib.org': 4.4762260902405335e-05,\n",
952
+ " 'hv.greenspun.com': 4.475602991095494e-05,\n",
953
+ " 'www.no2nuclearpower.org.uk': 4.4755558689961015e-05,\n",
954
+ " 'slife.org': 4.459137804612485e-05,\n",
955
+ " 'motm.kicks-ass.net': 4.454331712952168e-05,\n",
956
+ " 'thenewsnigeria.com.ng': 4.453633218448101e-05,\n",
957
+ " 'www.marketwatch.com': 4.449777905762444e-05,\n",
958
+ " 'hotair.com': 4.445648197467249e-05,\n",
959
+ " 'www.sb.marketwatch.com': 4.438000643213583e-05,\n",
960
+ " 'lolcow.farm': 4.427683078312817e-05,\n",
961
+ " 'coasterbuzz.com': 4.418396399955678e-05,\n",
962
+ " 'staging.pastemagazine.com': 4.415840932257871e-05,\n",
963
+ " 'english.newsnationtv.com': 4.413182158419087e-05,\n",
964
+ " 'www.playbill.com': 4.412994032499206e-05,\n",
965
+ " 'www.kmuw.org': 4.410439289756774e-05,\n",
966
+ " 'db-engines.com': 4.409923121529586e-05,\n",
967
+ " 'comicbook.com': 4.408087534519416e-05,\n",
968
+ " 'original.newsbreak.com': 4.407597464685737e-05,\n",
969
+ " 'www.courierpostonline.com': 4.397977306855976e-05,\n",
970
+ " 'www.lovewhatmatters.com': 4.3963660935344505e-05,\n",
971
+ " 'www.9news.com.au': 4.39539102855472e-05,\n",
972
+ " 'www.emirates247.com': 4.391295030684475e-05,\n",
973
+ " 'www.ipsnews.net': 4.388738475553604e-05,\n",
974
+ " 'sparklefaerydreams.com': 4.38864568126557e-05,\n",
975
+ " 'www.windowscentral.com': 4.3863279989308495e-05,\n",
976
+ " 'grantome.com': 4.3831160841407393e-05,\n",
977
+ " 'boards.weddingbee.com': 4.3814976212654615e-05,\n",
978
+ " 'www.fda.gov': 4.379600413048394e-05,\n",
979
+ " 'www.trulia.com': 4.379217274132567e-05,\n",
980
+ " 'windowsreport.com': 4.37904980944088e-05,\n",
981
+ " 'news.wosu.org': 4.378937078880027e-05,\n",
982
+ " 'www.belfastlive.co.uk': 4.377158763344503e-05,\n",
983
+ " '0-bmcinfectdis-biomedcentral-com.brum.beds.ac.uk': 4.369082760464056e-05,\n",
984
+ " 'www.dispatch.com': 4.364714179372711e-05,\n",
985
+ " 'pagancentral.org': 4.359199081355386e-05,\n",
986
+ " 'm.egwwritings.org': 4.354498833179858e-05,\n",
987
+ " 'fivethirtyeight.com': 4.35326604656422e-05,\n",
988
+ " 'www.theautomaticearth.com': 4.351979613250813e-05,\n",
989
+ " 'www.coventrytelegraph.net': 4.350388336202108e-05,\n",
990
+ " 'www.lawgazette.co.uk': 4.344794218048879e-05,\n",
991
+ " 'www.ozy.com': 4.343802116617829e-05,\n",
992
+ " 'www.glamour.com': 4.3415333687709385e-05,\n",
993
+ " 'www.idownloadblog.com': 4.340395551309461e-05,\n",
994
+ " 'apps.shopify.com': 4.3145092072472e-05,\n",
995
+ " 'www.sheknows.com': 4.31031461544592e-05,\n",
996
+ " 'www.dailystar.co.uk': 4.309298590487486e-05,\n",
997
+ " 'www.thrillist.com': 4.305201142706491e-05,\n",
998
+ " 'athlonsports.com': 4.299827410987342e-05,\n",
999
+ " 'www.reddit.com': 4.295538212509589e-05,\n",
1000
+ " 'es.uhaul.com': 4.2948872025826016e-05,\n",
1001
+ " 'bmcpsychiatry.biomedcentral.com': 4.2947364118645467e-05,\n",
1002
+ " 'www.thestarpress.com': 4.290172092821881e-05,\n",
1003
+ " 'sportslens.com': 4.285106829614907e-05,\n",
1004
+ " 'lists.advaita-vedanta.org': 4.268204857518301e-05,\n",
1005
+ " 'www.mixonline.com': 4.264579355686603e-05,\n",
1006
+ " 'hstreasures.com': 4.262433850253508e-05,\n",
1007
+ " 'www.technologynetworks.com': 4.262179028439102e-05,\n",
1008
+ " 'bangordailynews.com': 4.2552988394501534e-05,\n",
1009
+ " 'www.complex.com': 4.2549657224552194e-05,\n",
1010
+ " 'virologyj.biomedcentral.com': 4.25204306485984e-05,\n",
1011
+ " 'news.bbc.co.uk': 4.25185167664077e-05,\n",
1012
+ " 'committees.parliament.uk': 4.250431851538315e-05,\n",
1013
+ " 'local.job-applications.com': 4.243543688040238e-05,\n",
1014
+ " 'www.mondaq.com': 4.239263189026988e-05,\n",
1015
+ " 'www.newsday.co.zw': 4.235220837854513e-05,\n",
1016
+ " 'www.independent.com': 4.231493117314903e-05,\n",
1017
+ " 'financialpost.com': 4.2311987854325456e-05,\n",
1018
+ " 'www.nydailynews.com': 4.229718064078568e-05,\n",
1019
+ " 'www.lincolnshirelive.co.uk': 4.228084014662721e-05,\n",
1020
+ " 'www.nairaland.com': 4.22537449394767e-05,\n",
1021
+ " 'www.weforum.org': 4.224628514866522e-05,\n",
1022
+ " 'www.ishn.com': 4.224242838606882e-05,\n",
1023
+ " 'phenomenica.com': 4.21905324555305e-05,\n",
1024
+ " 'collegebasketball.nbcsports.com': 4.212842552853158e-05,\n",
1025
+ " 'ncf.sobek.ufl.edu': 4.2059971617222224e-05,\n",
1026
+ " 'www.euractiv.com': 4.205985199958531e-05,\n",
1027
+ " 'www.ocregister.com': 4.204707103631939e-05,\n",
1028
+ " 'podcasts.apple.com': 4.199048464450312e-05,\n",
1029
+ " 'www.universetoday.com': 4.196350543021264e-05,\n",
1030
+ " 'www.sec.marketwatch.com': 4.19586446044215e-05,\n",
1031
+ " 'www.spokesman.com': 4.193551852795056e-05,\n",
1032
+ " 'www.fishbowlapp.com': 4.186291062234093e-05,\n",
1033
+ " 'cointelegraph.com': 4.185440689578908e-05,\n",
1034
+ " 'bluray.highdefdigest.com': 4.172987406142613e-05,\n",
1035
+ " 'www.bishop-accountability.org': 4.171900698035091e-05,\n",
1036
+ " 'wilearncap.asuscomm.com': 4.169513782462033e-05,\n",
1037
+ " 'www.gofundme.com': 4.166943090701344e-05,\n",
1038
+ " 'library.kiwix.org': 4.166776350965033e-05,\n",
1039
+ " 'www.laptopmag.com': 4.166093805479222e-05,\n",
1040
+ " 'wgday.net': 4.163025431853415e-05,\n",
1041
+ " 'punchng.com': 4.1627372620917465e-05,\n",
1042
+ " 'www.vanguardngr.com': 4.162552760948742e-05,\n",
1043
+ " 'www.oregonlive.com': 4.159901236663712e-05,\n",
1044
+ " 'philly.metro.us': 4.15494145446384e-05,\n",
1045
+ " 'elifesciences.org': 4.149664141809601e-05,\n",
1046
+ " 'www.kirkusreviews.com': 4.14384673740079e-05,\n",
1047
+ " 'pmg.org.za': 4.143711895700991e-05,\n",
1048
+ " 'www.casino.org': 4.1405554399970885e-05,\n",
1049
+ " 'www.theblaze.com': 4.134392594351965e-05,\n",
1050
+ " 'nonprofitquarterly.org': 4.131678361426974e-05,\n",
1051
+ " 'www.cbsnews.com': 4.127287669196684e-05,\n",
1052
+ " 'tvline.com': 4.1269871751936365e-05,\n",
1053
+ " 'ftw.usatoday.com': 4.1261828372047806e-05,\n",
1054
+ " 'www.presidency.ucsb.edu': 4.124738001141878e-05,\n",
1055
+ " 'www.news24.com': 4.1196886869531606e-05,\n",
1056
+ " 'www.mtv.com': 4.119597705053565e-05,\n",
1057
+ " 'freshairarchive.org': 4.119180855712788e-05,\n",
1058
+ " 'osaa.dk': 4.113290230811076e-05,\n",
1059
+ " 'bleedingcool.com': 4.106717785378927e-05,\n",
1060
+ " 'www.brainscape.com': 4.103747643206468e-05,\n",
1061
+ " 'www.sbs.com.au': 4.101813099787574e-05,\n",
1062
+ " 'forums.theregister.com': 4.093041864702248e-05,\n",
1063
+ " 'www.cleveland.com': 4.088752666224495e-05,\n",
1064
+ " 'kotaku.com': 4.085596572998281e-05,\n",
1065
+ " 'www.definitions.net': 4.0845301636412665e-05,\n",
1066
+ " 'www.evwind.es': 4.081171082909978e-05,\n",
1067
+ " 'www.wind-watch.org': 4.079068712321711e-05,\n",
1068
+ " 'www.pressconnects.com': 4.078646063337932e-05,\n",
1069
+ " 'www.nfl.com': 4.07140810887129e-05,\n",
1070
+ " 'pike.lysator.liu.se': 4.071288491234371e-05,\n",
1071
+ " 'www.getreading.co.uk': 4.070931088234366e-05,\n",
1072
+ " 'smallbiztrends.com': 4.07008071557918e-05,\n",
1073
+ " 'www.techtimes.com': 4.068088175730266e-05,\n",
1074
+ " 'blog.hubspot.com': 4.066579181116653e-05,\n",
1075
+ " 'practicaldev-herokuapp-com.global.ssl.fastly.net': 4.0643832912849765e-05,\n",
1076
+ " 'www.wbur.org': 4.060892993630765e-05,\n",
1077
+ " 'fcw.com': 4.059569950070907e-05,\n",
1078
+ " 'bmjopen.bmj.com': 4.055734936135758e-05,\n",
1079
+ " 'asia.nikkei.com': 4.054750446736148e-05,\n",
1080
+ " 'www.flsenate.gov': 4.0544376284917216e-05,\n",
1081
+ " 'www.thespruceeats.com': 4.051689685141781e-05,\n",
1082
+ " 'www.cambstimes.co.uk': 4.0463540135798334e-05,\n",
1083
+ " 'efinne1540.wordpress.com': 4.0440820034337546e-05,\n",
1084
+ " 'mb.com.ph': 4.0383961784255546e-05,\n",
1085
+ " 'www.pnj.com': 4.03558298909184e-05,\n",
1086
+ " 'friendlyatheist.patheos.com': 4.033316778588763e-05,\n",
1087
+ " 'www.jayski.com': 4.030019319064372e-05,\n",
1088
+ " 'www.un.org': 4.029648866867612e-05,\n",
1089
+ " 'www.goodhousekeeping.com': 4.028661477646501e-05,\n",
1090
+ " 'www.americanprogress.org': 4.0260081409730334e-05,\n",
1091
+ " 'www.thisisanfield.com': 4.021005948883708e-05,\n",
1092
+ " 'en-academic.com': 4.017458742232538e-05,\n",
1093
+ " 'www.ecowatch.com': 4.011691722222308e-05,\n",
1094
+ " 'm.orlandoweekly.com': 4.0110331002638805e-05,\n",
1095
+ " 'www.rogerebert.com': 4.0106738848754366e-05,\n",
1096
+ " 'www.cbpp.org': 4.007380050127922e-05,\n",
1097
+ " 'theinventory.com': 4.006688805177608e-05,\n",
1098
+ " 'www.jmir.org': 4.005055843194824e-05,\n",
1099
+ " 'www.irs.gov': 4.0033590851390175e-05,\n",
1100
+ " 'books.google.gr': 4.003322837370254e-05,\n",
1101
+ " 'simple.wikipedia.org': 4.001320873101461e-05,\n",
1102
+ " 'www.smartcompany.com.au': 3.999999279452353e-05,\n",
1103
+ " 'www.boxinginsider.com': 3.9964658469533137e-05,\n",
1104
+ " 'electrek.co': 3.996420899720047e-05,\n",
1105
+ " 'www.earlynewspaper.com': 3.995206236988791e-05,\n",
1106
+ " 'visionbib.com': 3.994921692004e-05,\n",
1107
+ " 'www.oom2.com': 3.9939150914654456e-05,\n",
1108
+ " 'www.pilotonline.com': 3.989394269745296e-05,\n",
1109
+ " 'oilprice.com': 3.988750871849748e-05,\n",
1110
+ " 'www.androidcentral.com': 3.98828218819964e-05,\n",
1111
+ " 'everything2.com': 3.988159308263532e-05,\n",
1112
+ " 'www.tvtechnology.com': 3.981694881182298e-05,\n",
1113
+ " 'jalopnik.com': 3.971168891611144e-05,\n",
1114
+ " 'fr.uhaul.com': 3.970924219171992e-05,\n",
1115
+ " 'wn.com': 3.964988284559325e-05,\n",
1116
+ " 'laist.com': 3.963267965453823e-05,\n",
1117
+ " 'www.kirchgemeinde-roeschenz.ch': 3.961209817143447e-05,\n",
1118
+ " 'www.ydr.com': 3.959250987719482e-05,\n",
1119
+ " 'www.vogue.com': 3.9586195515876265e-05,\n",
1120
+ " 'motorsports.nbcsports.com': 3.958328844482145e-05,\n",
1121
+ " 'www.coinspeaker.com': 3.956746991853318e-05,\n",
1122
+ " 'www.essence.com': 3.956449035194085e-05,\n",
1123
+ " 'www.rsvplive.ie': 3.955596850150461e-05,\n",
1124
+ " 'www.epo.org': 3.948293649700045e-05,\n",
1125
+ " 'forums.cfl.ca': 3.946960094287246e-05,\n",
1126
+ " 'spacecoastdaily.com': 3.9415624390407136e-05,\n",
1127
+ " 'britishlistedbuildings.co.uk': 3.9370361801552494e-05,\n",
1128
+ " 'books.google.co.kr': 3.932360942940168e-05,\n",
1129
+ " 'phys.org': 3.932302221554772e-05,\n",
1130
+ " 'www.outsidethebeltway.com': 3.931450036511148e-05,\n",
1131
+ " 'people.com': 3.931203551683558e-05,\n",
1132
+ " 'www.jta.org': 3.930082770673399e-05,\n",
1133
+ " 'sg.hotels.com': 3.9292255109421485e-05,\n",
1134
+ " 'ucanr.edu': 3.929047896875209e-05,\n",
1135
+ " 'www.revisor.mn.gov': 3.927835409010079e-05,\n",
1136
+ " 'english.alarabiya.net': 3.92559094716826e-05,\n",
1137
+ " 'www.marketplace.org': 3.920590567467372e-05,\n",
1138
+ " 'wearethemighty.rebelmouse.com': 3.919585416839568e-05,\n",
1139
+ " '1library.net': 3.918777091596148e-05,\n",
1140
+ " 'www.mansfieldnewsjournal.com': 3.918767667176269e-05,\n",
1141
+ " 'www.aging-us.com': 3.917408738325336e-05,\n",
1142
+ " 'diehardgamefan.com': 3.916376401870959e-05,\n",
1143
+ " 'www.rhsupplies.org': 3.914253732532184e-05,\n",
1144
+ " 'erenow.net': 3.9135585003273055e-05,\n",
1145
+ " 'owlcation.com': 3.913422208716756e-05,\n",
1146
+ " 'www.gamedeveloper.com': 3.911479690788735e-05,\n",
1147
+ " 'www.idcrawl.com': 3.904242461277468e-05,\n",
1148
+ " 'carnegieendowment.org': 3.89848740303093e-05,\n",
1149
+ " 'www.thedailyjournal.com': 3.896226629693168e-05,\n",
1150
+ " 'webot.org': 3.8944465017692056e-05,\n",
1151
+ " 'content.yudu.com': 3.892349568346253e-05,\n",
1152
+ " 'www.colts.com': 3.892286497228605e-05,\n",
1153
+ " 'access.redhat.com': 3.891852611436509e-05,\n",
1154
+ " 'wowpedia.fandom.com': 3.890621274731622e-05,\n",
1155
+ " 'www.drf.com': 3.886028319951633e-05,\n",
1156
+ " 'en.m.wikibooks.org': 3.8852707415844816e-05,\n",
1157
+ " 'support.mozilla.org': 3.882986769674711e-05,\n",
1158
+ " 'www.monexsecurities.com.au': 3.8821490837385927e-05,\n",
1159
+ " 'blog.flibo.ai': 3.876277670154324e-05,\n",
1160
+ " 'books.google.com.eg': 3.876086281935255e-05,\n",
1161
+ " 'lists.gnupg.org': 3.873693929196881e-05,\n",
1162
+ " 'www.miastenia.it': 3.87040118188243e-05,\n",
1163
+ " 'www.wuwm.com': 3.870314187237398e-05,\n",
1164
+ " 'legaltalknetwork.com': 3.8688117172221625e-05,\n",
1165
+ " 'books.google.no': 3.862479956974601e-05,\n",
1166
+ " 'marketbusinessnews.com': 3.860286966964426e-05,\n",
1167
+ " 'epjournal.net': 3.856268176841647e-05,\n",
1168
+ " 'www.mlbtraderumors.com': 3.85441374099172e-05,\n",
1169
+ " 'arstechnica.com': 3.854348495007946e-05,\n",
1170
+ " 'blueandgreentomorrow.com': 3.851208350799988e-05,\n",
1171
+ " 'www.lawnet.gov.lk': 3.849381463254321e-05,\n",
1172
+ " 'lawofselfdefense.com': 3.8400063403414e-05,\n",
1173
+ " 'www.rd.com': 3.838551717380931e-05,\n",
1174
+ " 'www.outdoorlife.com': 3.833586498015744e-05,\n",
1175
+ " 'www.packers.com': 3.833264617829127e-05,\n",
1176
+ " 'www.revolt.tv': 3.831813982123222e-05,\n",
1177
+ " 'www.firstshowing.net': 3.828763732381796e-05,\n",
1178
+ " 'www.wesa.fm': 3.8286252659051205e-05,\n",
1179
+ " 'www.ammoland.com': 3.82664287543146e-05,\n",
1180
+ " 'monetmagazine.top': 3.826308308525775e-05,\n",
1181
+ " 'www.voanews.com': 3.826214064326991e-05,\n",
1182
+ " 'mg.co.za': 3.824225149254952e-05,\n",
1183
+ " 'stemcellres.biomedcentral.com': 3.82395147860079e-05,\n",
1184
+ " 'community.atlassian.com': 3.8226657702427584e-05,\n",
1185
+ " 'www.counselling-directory.org.uk': 3.822351864565269e-05,\n",
1186
+ " 'www.nationalreview.com': 3.817521849377568e-05,\n",
1187
+ " 'www.webmd.com': 3.8168472784008847e-05,\n",
1188
+ " 'www.portsmouth.co.uk': 3.815725409957663e-05,\n",
1189
+ " 'www.montgomeryadvertiser.com': 3.813558155863309e-05,\n",
1190
+ " 'www.blogarama.com': 3.812341680743616e-05,\n",
1191
+ " 'www.usmagazine.com': 3.810199437609708e-05,\n",
1192
+ " 'www.diverseeducation.com': 3.809790200300371e-05,\n",
1193
+ " 'www.lexology.com': 3.807703053774985e-05,\n",
1194
+ " 'www.modernhealthcare.com': 3.8055782095700845e-05,\n",
1195
+ " 'amt.copernicus.org': 3.803882901425029e-05,\n",
1196
+ " 'leadership.ng': 3.798082895945224e-05,\n",
1197
+ " 'www.imore.com': 3.797297044318437e-05,\n",
1198
+ " 'electricliterature.com': 3.796756952563865e-05,\n",
1199
+ " 'community.spiceworks.com': 3.792525025560758e-05,\n",
1200
+ " 'www.consumeraffairs.com': 3.784720880946035e-05,\n",
1201
+ " 'adops.motherjones.com': 3.7835029559155904e-05,\n",
1202
+ " 'www.cbssports.com': 3.778831705955073e-05,\n",
1203
+ " 'www.mail-archive.com': 3.778817206847568e-05,\n",
1204
+ " 'www.tumbral.com': 3.7780146812471496e-05,\n",
1205
+ " 'www.thestar.co.uk': 3.7742706492115954e-05,\n",
1206
+ " 'www.tampabay.com': 3.77381682714668e-05,\n",
1207
+ " 'yourmagazine.top': 3.773286159811987e-05,\n",
1208
+ " 'www.freeadvice.com': 3.772870397904272e-05,\n",
1209
+ " 'www.macleans.ca': 3.77257824088804e-05,\n",
1210
+ " ...}"
1211
+ ]
1212
+ },
1213
+ "execution_count": 16,
1214
+ "metadata": {},
1215
+ "output_type": "execute_result"
1216
+ }
1217
+ ],
1218
+ "execution_count": 16
1219
+ },
1220
+ {
1221
+ "metadata": {
1222
+ "ExecuteTime": {
1223
+ "end_time": "2024-05-15T14:01:34.100514Z",
1224
+ "start_time": "2024-05-15T14:01:34.096130Z"
1225
+ }
1226
+ },
1227
+ "cell_type": "code",
1228
+ "source": [
1229
+ "filtered_df = df_43[df_43['in_49'] == 0] # Filter rows where 'in_49' is 0\n",
1230
+ "sorted_df = filtered_df.sort_values(by='Frequency', ascending=False) # Sort by 'Frequency' column in descending order"
1231
+ ],
1232
+ "id": "274edf9d4064ad1d",
1233
+ "outputs": [],
1234
+ "execution_count": 33
1235
+ },
1236
+ {
1237
+ "metadata": {
1238
+ "ExecuteTime": {
1239
+ "end_time": "2024-05-15T14:01:36.920220Z",
1240
+ "start_time": "2024-05-15T14:01:36.914063Z"
1241
+ }
1242
+ },
1243
+ "cell_type": "code",
1244
+ "source": "sorted_df",
1245
+ "id": "7d62dfa545a519f1",
1246
+ "outputs": [
1247
+ {
1248
+ "data": {
1249
+ "text/plain": [
1250
+ " URL Frequency in_49 change_to_49\n",
1251
+ "9 ufdc.ufl.edu 0.000443 0.0 -0.000443\n",
1252
+ "22 www.hotfreebooks.com 0.000244 0.0 -0.000244\n",
1253
+ "37 irclogs.ubuntu.com 0.000190 0.0 -0.000190\n",
1254
+ "47 transparentpng.netlify.app 0.000181 0.0 -0.000181\n",
1255
+ "85 www.preceptaustin.org 0.000120 0.0 -0.000120\n",
1256
+ "... ... ... ... ...\n",
1257
+ "59994 www.annahelizabeth.com 0.000001 0.0 -0.000001\n",
1258
+ "59996 meisendorf.com 0.000001 0.0 -0.000001\n",
1259
+ "59997 www.anyrubbish.co.uk 0.000001 0.0 -0.000001\n",
1260
+ "59998 qjshhxx.cn 0.000001 0.0 -0.000001\n",
1261
+ "59999 www.al-enterprise.com 0.000001 0.0 -0.000001\n",
1262
+ "\n",
1263
+ "[29485 rows x 4 columns]"
1264
+ ],
1265
+ "text/html": [
1266
+ "<div>\n",
1267
+ "<style scoped>\n",
1268
+ " .dataframe tbody tr th:only-of-type {\n",
1269
+ " vertical-align: middle;\n",
1270
+ " }\n",
1271
+ "\n",
1272
+ " .dataframe tbody tr th {\n",
1273
+ " vertical-align: top;\n",
1274
+ " }\n",
1275
+ "\n",
1276
+ " .dataframe thead th {\n",
1277
+ " text-align: right;\n",
1278
+ " }\n",
1279
+ "</style>\n",
1280
+ "<table border=\"1\" class=\"dataframe\">\n",
1281
+ " <thead>\n",
1282
+ " <tr style=\"text-align: right;\">\n",
1283
+ " <th></th>\n",
1284
+ " <th>URL</th>\n",
1285
+ " <th>Frequency</th>\n",
1286
+ " <th>in_49</th>\n",
1287
+ " <th>change_to_49</th>\n",
1288
+ " </tr>\n",
1289
+ " </thead>\n",
1290
+ " <tbody>\n",
1291
+ " <tr>\n",
1292
+ " <th>9</th>\n",
1293
+ " <td>ufdc.ufl.edu</td>\n",
1294
+ " <td>0.000443</td>\n",
1295
+ " <td>0.0</td>\n",
1296
+ " <td>-0.000443</td>\n",
1297
+ " </tr>\n",
1298
+ " <tr>\n",
1299
+ " <th>22</th>\n",
1300
+ " <td>www.hotfreebooks.com</td>\n",
1301
+ " <td>0.000244</td>\n",
1302
+ " <td>0.0</td>\n",
1303
+ " <td>-0.000244</td>\n",
1304
+ " </tr>\n",
1305
+ " <tr>\n",
1306
+ " <th>37</th>\n",
1307
+ " <td>irclogs.ubuntu.com</td>\n",
1308
+ " <td>0.000190</td>\n",
1309
+ " <td>0.0</td>\n",
1310
+ " <td>-0.000190</td>\n",
1311
+ " </tr>\n",
1312
+ " <tr>\n",
1313
+ " <th>47</th>\n",
1314
+ " <td>transparentpng.netlify.app</td>\n",
1315
+ " <td>0.000181</td>\n",
1316
+ " <td>0.0</td>\n",
1317
+ " <td>-0.000181</td>\n",
1318
+ " </tr>\n",
1319
+ " <tr>\n",
1320
+ " <th>85</th>\n",
1321
+ " <td>www.preceptaustin.org</td>\n",
1322
+ " <td>0.000120</td>\n",
1323
+ " <td>0.0</td>\n",
1324
+ " <td>-0.000120</td>\n",
1325
+ " </tr>\n",
1326
+ " <tr>\n",
1327
+ " <th>...</th>\n",
1328
+ " <td>...</td>\n",
1329
+ " <td>...</td>\n",
1330
+ " <td>...</td>\n",
1331
+ " <td>...</td>\n",
1332
+ " </tr>\n",
1333
+ " <tr>\n",
1334
+ " <th>59994</th>\n",
1335
+ " <td>www.annahelizabeth.com</td>\n",
1336
+ " <td>0.000001</td>\n",
1337
+ " <td>0.0</td>\n",
1338
+ " <td>-0.000001</td>\n",
1339
+ " </tr>\n",
1340
+ " <tr>\n",
1341
+ " <th>59996</th>\n",
1342
+ " <td>meisendorf.com</td>\n",
1343
+ " <td>0.000001</td>\n",
1344
+ " <td>0.0</td>\n",
1345
+ " <td>-0.000001</td>\n",
1346
+ " </tr>\n",
1347
+ " <tr>\n",
1348
+ " <th>59997</th>\n",
1349
+ " <td>www.anyrubbish.co.uk</td>\n",
1350
+ " <td>0.000001</td>\n",
1351
+ " <td>0.0</td>\n",
1352
+ " <td>-0.000001</td>\n",
1353
+ " </tr>\n",
1354
+ " <tr>\n",
1355
+ " <th>59998</th>\n",
1356
+ " <td>qjshhxx.cn</td>\n",
1357
+ " <td>0.000001</td>\n",
1358
+ " <td>0.0</td>\n",
1359
+ " <td>-0.000001</td>\n",
1360
+ " </tr>\n",
1361
+ " <tr>\n",
1362
+ " <th>59999</th>\n",
1363
+ " <td>www.al-enterprise.com</td>\n",
1364
+ " <td>0.000001</td>\n",
1365
+ " <td>0.0</td>\n",
1366
+ " <td>-0.000001</td>\n",
1367
+ " </tr>\n",
1368
+ " </tbody>\n",
1369
+ "</table>\n",
1370
+ "<p>29485 rows × 4 columns</p>\n",
1371
+ "</div>"
1372
+ ]
1373
+ },
1374
+ "execution_count": 34,
1375
+ "metadata": {},
1376
+ "output_type": "execute_result"
1377
+ }
1378
+ ],
1379
+ "execution_count": 34
1380
+ },
1381
+ {
1382
+ "metadata": {
1383
+ "ExecuteTime": {
1384
+ "end_time": "2024-05-15T14:03:08.719028Z",
1385
+ "start_time": "2024-05-15T14:03:08.082516Z"
1386
+ }
1387
+ },
1388
+ "cell_type": "code",
1389
+ "source": "assert all(row[1][\"URL\"] not in freqs_49 for row in sorted_df.iterrows())",
1390
+ "id": "3a2317033c481119",
1391
+ "outputs": [],
1392
+ "execution_count": 36
1393
+ },
1394
+ {
1395
+ "metadata": {},
1396
+ "cell_type": "code",
1397
+ "outputs": [],
1398
+ "execution_count": null,
1399
+ "source": "",
1400
+ "id": "d67bd99d6e230caf"
1401
+ }
1402
+ ],
1403
+ "metadata": {
1404
+ "kernelspec": {
1405
+ "display_name": "Python 3",
1406
+ "language": "python",
1407
+ "name": "python3"
1408
+ },
1409
+ "language_info": {
1410
+ "codemirror_mode": {
1411
+ "name": "ipython",
1412
+ "version": 2
1413
+ },
1414
+ "file_extension": ".py",
1415
+ "mimetype": "text/x-python",
1416
+ "name": "python",
1417
+ "nbconvert_exporter": "python",
1418
+ "pygments_lexer": "ipython2",
1419
+ "version": "2.7.6"
1420
+ }
1421
+ },
1422
+ "nbformat": 4,
1423
+ "nbformat_minor": 5
1424
+ }
notebooks/create_graphs_for_blog.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/loubna-ablations_faq_metrics.csv ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,piqa/acc,piqa/acc_norm,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,all/acc,all/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
2
+ filtered_web_min_score_4_fix-seed-1,0,1000,0.3682507313787937,0.2540000081062317,0.2709999978542328,0.2800000011920929,0.2840000092983246,0.13600000739097595,0.3059999942779541,0.5759999752044678,0.5789999961853027,0.35100001096725464,0.38100001215934753,0.4909999966621399,0.503000020980835,0.26319819688796997,0.2825128138065338,0.34049999713897705,0.3529999852180481,0.2515593469142914,0.26900583505630493
3
+ filtered_web_min_score_4_fix-seed-1,0,2000,0.38916464149951935,0.2980000078678131,0.27900001406669617,0.3009999990463257,0.3240000009536743,0.18799999356269836,0.32600000500679016,0.6010000109672546,0.6079999804496765,0.36000001430511475,0.37599998712539673,0.5019999742507935,0.503000020980835,0.2754783630371094,0.2941242456436157,0.41100001335144043,0.4189999997615814,0.26024726033210754,0.2783171236515045
4
+ filtered_web_min_score_4_fix-seed-1,0,3000,0.39035574346780777,0.2840000092983246,0.27900001406669617,0.3050000071525574,0.3269999921321869,0.19200000166893005,0.32199999690055847,0.6150000095367432,0.6200000047683716,0.3619999885559082,0.38199999928474426,0.5149999856948853,0.49000000953674316,0.2794281244277954,0.2956494987010956,0.4259999990463257,0.4230000078678131,0.263821542263031,0.2798459231853485
5
+ filtered_web_min_score_4_fix-seed-1,0,4000,0.4036811888217926,0.3109999895095825,0.28600001335144043,0.31299999356269836,0.3440000116825104,0.21199999749660492,0.3240000009536743,0.6200000047683716,0.6439999938011169,0.35600000619888306,0.38199999928474426,0.5120000243186951,0.5019999742507935,0.29403528571128845,0.3073711395263672,0.44749999046325684,0.45649999380111694,0.2788296937942505,0.2909495234489441
6
+ filtered_web_min_score_4_fix-seed-1,0,5000,0.4085462875664234,0.3050000071525574,0.2930000126361847,0.32499998807907104,0.35499998927116394,0.22200000286102295,0.3540000021457672,0.628000020980835,0.6359999775886536,0.36399999260902405,0.3880000114440918,0.5180000066757202,0.49799999594688416,0.2961043119430542,0.31081706285476685,0.4544999897480011,0.44999998807907104,0.2802768647670746,0.2943703234195709
7
+ filtered_web_min_score_4_fix-seed-1,0,6000,0.41010819375514984,0.30000001192092896,0.289000004529953,0.3330000042915344,0.35600000619888306,0.20600000023841858,0.3440000116825104,0.6259999871253967,0.640999972820282,0.3720000088214874,0.38199999928474426,0.5379999876022339,0.5019999742507935,0.30340614914894104,0.3206590712070465,0.4625000059604645,0.46149998903274536,0.28809472918510437,0.30536559224128723
8
+ filtered_web_min_score_4_fix-seed-1,0,7000,0.4124617166817188,0.30399999022483826,0.2849999964237213,0.33799999952316284,0.36800000071525574,0.2199999988079071,0.3479999899864197,0.6269999742507935,0.6380000114440918,0.37299999594688416,0.382999986410141,0.5389999747276306,0.5049999952316284,0.30451327562332153,0.3200468420982361,0.47099998593330383,0.4684999883174896,0.2886028587818146,0.30419376492500305
9
+ filtered_web_min_score_4_fix-seed-1,0,8000,0.41384265199303627,0.3199999928474426,0.2840000092983246,0.3330000042915344,0.3720000088214874,0.21400000154972076,0.3540000021457672,0.6309999823570251,0.6399999856948853,0.36500000953674316,0.3779999911785126,0.5370000004768372,0.5080000162124634,0.30659323930740356,0.32282689213752747,0.4650000035762787,0.4675000011920929,0.2912028133869171,0.30724120140075684
10
+ fineweb_2B_educational_minimum_score_3-seed-0,0,1000,0.3635076731443405,0.2549999952316284,0.25999999046325684,0.2809999883174896,0.2849999964237213,0.1420000046491623,0.2759999930858612,0.5899999737739563,0.6209999918937683,0.36500000953674316,0.37599998712539673,0.503000020980835,0.48899999260902405,0.26151496171951294,0.27996155619621277,0.31299999356269836,0.33399999141693115,0.2497626692056656,0.2670614421367645
11
+ fineweb_2B_educational_minimum_score_3-seed-0,0,2000,0.383779339492321,0.28299999237060547,0.2800000011920929,0.3089999854564667,0.3330000042915344,0.164000004529953,0.3100000023841858,0.6230000257492065,0.6399999856948853,0.3580000102519989,0.3709999918937683,0.5040000081062317,0.49399998784065247,0.26660147309303284,0.28831353783607483,0.3634999990463257,0.36899998784065247,0.2519490420818329,0.2732347548007965
12
+ fineweb_2B_educational_minimum_score_3-seed-0,0,4000,0.4050140306353569,0.3089999854564667,0.29499998688697815,0.34299999475479126,0.3790000081062317,0.20800000429153442,0.328000009059906,0.6570000052452087,0.675000011920929,0.36399999260902405,0.3840000033378601,0.5080000162124634,0.4950000047683716,0.28317055106163025,0.3020445704460144,0.40049999952316284,0.398499995470047,0.2669488787651062,0.28561222553253174
13
+ fineweb_2B_educational_minimum_score_3-seed-0,0,6000,0.41626306250691414,0.33399999141693115,0.3140000104904175,0.35600000619888306,0.4189999997615814,0.21400000154972076,0.3400000035762787,0.6650000214576721,0.6800000071525574,0.3580000102519989,0.3880000114440918,0.5040000081062317,0.4869999885559082,0.2945695221424103,0.31046855449676514,0.421999990940094,0.40849998593330383,0.278456449508667,0.2936044931411743
14
+ fineweb_2B_educational_minimum_score_3-seed-0,0,8000,0.42167554423213005,0.33899998664855957,0.3140000104904175,0.3659999966621399,0.4390000104904175,0.21400000154972076,0.3400000035762787,0.6489999890327454,0.6819999814033508,0.3709999918937683,0.38999998569488525,0.4970000088214874,0.4869999885559082,0.3004492521286011,0.31168535351753235,0.4350000023841858,0.42750000953674316,0.28461754322052,0.2939043641090393
15
+ fineweb_2B_educational_regression-seed-6,0,1000,0.3628821112215519,0.24899999797344208,0.25999999046325684,0.28700000047683716,0.30300000309944153,0.1379999965429306,0.2720000147819519,0.6029999852180481,0.625,0.36000001430511475,0.38499999046325684,0.5120000243186951,0.4970000088214874,0.2574775218963623,0.2720729410648346,0.29350000619888306,0.30250000953674316,0.24561470746994019,0.25855687260627747
16
+ fineweb_2B_educational_regression-seed-6,0,2000,0.3791401833295822,0.27900001406669617,0.2639999985694885,0.3149999976158142,0.3230000138282776,0.15800000727176666,0.2919999957084656,0.6299999952316284,0.6520000100135803,0.3569999933242798,0.3880000114440918,0.5180000066757202,0.4970000088214874,0.26788005232810974,0.28260648250579834,0.3255000114440918,0.34950000047683716,0.2544597089290619,0.2676214277744293
17
+ fineweb_2B_educational_regression-seed-6,0,4000,0.4023119993507862,0.3190000057220459,0.3050000071525574,0.3490000069141388,0.38600000739097595,0.17399999499320984,0.3179999887943268,0.6549999713897705,0.6800000071525574,0.36399999260902405,0.382999986410141,0.5080000162124634,0.5,0.27732229232788086,0.29341956973075867,0.37049999833106995,0.3700000047683716,0.26168331503868103,0.2764959931373596
18
+ fineweb_2B_educational_regression-seed-6,0,6000,0.41444139182567596,0.3310000002384186,0.3149999976158142,0.3580000102519989,0.4180000126361847,0.1860000044107437,0.33799999952316284,0.6729999780654907,0.6990000009536743,0.3700000047683716,0.3790000081062317,0.5199999809265137,0.4950000047683716,0.28314903378486633,0.29828882217407227,0.3855000138282776,0.39149999618530273,0.26659101247787476,0.28003111481666565
19
+ fineweb_2B_educational_regression-seed-6,0,8000,0.41904011741280556,0.31299999356269836,0.31700000166893005,0.36800000071525574,0.421999990940094,0.18000000715255737,0.3499999940395355,0.6909999847412109,0.6970000267028809,0.3630000054836273,0.3799999952316284,0.5329999923706055,0.5139999985694885,0.28121474385261536,0.30207374691963196,0.3889999985694885,0.38850000500679016,0.2640869915485382,0.2838209271430969
20
+ fineweb_2024_10_all_2B-seed-6,0,1000,0.35314396768808365,0.2199999988079071,0.24300000071525574,0.2919999957084656,0.30799999833106995,0.1340000033378601,0.257999986410141,0.5799999833106995,0.5870000123977661,0.3529999852180481,0.38499999046325684,0.4909999966621399,0.4869999885559082,0.2508137822151184,0.2687792479991913,0.27649998664855957,0.3009999990463257,0.23999817669391632,0.2561517655849457
21
+ fineweb_2024_10_all_2B-seed-6,0,2000,0.37468964606523514,0.2759999930858612,0.2759999930858612,0.3050000071525574,0.32499998807907104,0.13199999928474426,0.2720000147819519,0.628000020980835,0.6520000100135803,0.3540000021457672,0.3889999985694885,0.5040000081062317,0.49799999594688416,0.2582138776779175,0.27513813972473145,0.3070000112056732,0.3255000114440918,0.24510353803634644,0.26001715660095215
22
+ fineweb_2024_10_all_2B-seed-6,0,3000,0.38586894795298576,0.2669999897480011,0.2809999883174896,0.3269999921321869,0.35899999737739563,0.17800000309944153,0.2879999876022339,0.6420000195503235,0.6769999861717224,0.3709999918937683,0.3880000114440918,0.5099999904632568,0.49799999594688416,0.26085224747657776,0.27577292919158936,0.32249999046325684,0.3370000123977661,0.24588415026664734,0.2589516043663025
23
+ fineweb_2024_10_all_2B-seed-6,0,4000,0.38998349756002426,0.2750000059604645,0.2809999883174896,0.35199999809265137,0.382999986410141,0.15199999511241913,0.28600001335144043,0.6480000019073486,0.6840000152587891,0.36500000953674316,0.38499999046325684,0.5049999952316284,0.49300000071525574,0.26505371928215027,0.2810457646846771,0.3264999985694885,0.3434999883174896,0.2504998445510864,0.2643679976463318
24
+ fineweb_2024_10_all_2B-seed-6,0,5000,0.3979869969189167,0.30300000309944153,0.296999990940094,0.3490000069141388,0.3970000147819519,0.15399999916553497,0.28999999165534973,0.6690000295639038,0.6940000057220459,0.375,0.382999986410141,0.5090000033378601,0.5019999742507935,0.26854822039604187,0.2826780378818512,0.33399999141693115,0.35600000619888306,0.25313395261764526,0.2648960053920746
25
+ fineweb_2024_10_all_2B-seed-6,0,6000,0.4039541743695736,0.31700000166893005,0.3190000057220459,0.35899999737739563,0.41600000858306885,0.16599999368190765,0.2840000092983246,0.6600000262260437,0.6949999928474426,0.3790000081062317,0.4000000059604645,0.515999972820282,0.49000000953674316,0.26819688081741333,0.2866784930229187,0.3330000042915344,0.35899999737739563,0.25210171937942505,0.2686333656311035
26
+ fineweb_2024_10_all_2B-seed-6,0,7000,0.4048592820763588,0.2980000078678131,0.3100000023841858,0.367000013589859,0.42399999499320984,0.17599999904632568,0.28999999165534973,0.6729999780654907,0.7020000219345093,0.38199999928474426,0.3959999978542328,0.5109999775886536,0.49399998784065247,0.27170079946517944,0.2894589602947235,0.32499998807907104,0.35100001096725464,0.25620266795158386,0.27187424898147583
27
+ fineweb_2024_10_all_2B-seed-6,0,8000,0.40328324213624,0.33000001311302185,0.3190000057220459,0.36399999260902405,0.41200000047683716,0.17599999904632568,0.2759999930858612,0.6579999923706055,0.703000009059906,0.382999986410141,0.40299999713897705,0.5099999904632568,0.49300000071525574,0.2675325274467468,0.2870177924633026,0.3294999897480011,0.35100001096725464,0.2510458528995514,0.26926591992378235
notebooks/loubna-edu_fw_ablations_metrics.csv ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,piqa/acc,piqa/acc_norm,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,all/acc,all/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
2
+ edu_fineweb_350b_tokens-seed-1,0,2000,0.3903256542980671,0.2840000092983246,0.28299999237060547,0.3140000104904175,0.32499998807907104,0.164000004529953,0.29600000381469727,0.6230000257492065,0.6320000290870667,0.3619999885559082,0.4059999883174896,0.5109999775886536,0.5109999775886536,0.2796742022037506,0.29916155338287354,0.37950000166893005,0.38499999046325684,0.26599687337875366,0.2846052646636963
3
+ edu_fineweb_350b_tokens-seed-1,0,4000,0.4146799184381962,0.32199999690055847,0.3070000112056732,0.34299999475479126,0.39500001072883606,0.19599999487400055,0.3199999928474426,0.656000018119812,0.6880000233650208,0.3709999918937683,0.3880000114440918,0.5180000066757202,0.4950000047683716,0.29061299562454224,0.31259292364120483,0.42149999737739563,0.4284999966621399,0.2744007706642151,0.29593929648399353
4
+ edu_fineweb_350b_tokens-seed-1,0,6000,0.42839035764336586,0.3190000057220459,0.3109999895095825,0.3720000088214874,0.4309999942779541,0.20200000703334808,0.35199999809265137,0.6600000262260437,0.6700000166893005,0.37299999594688416,0.3919999897480011,0.5199999809265137,0.5189999938011169,0.303979754447937,0.323323130607605,0.43149998784065247,0.44600000977516174,0.2885909378528595,0.30612286925315857
5
+ edu_fineweb_350b_tokens-seed-1,0,8000,0.4436151087284088,0.3400000035762787,0.3109999895095825,0.3790000081062317,0.46299999952316284,0.20399999618530273,0.36000001430511475,0.6809999942779541,0.699999988079071,0.3840000033378601,0.40400001406669617,0.5170000195503235,0.5170000195503235,0.3151480555534363,0.3332844376564026,0.46299999952316284,0.4790000021457672,0.2991863787174225,0.3149208426475525
6
+ edu_fineweb_350b_tokens-seed-1,0,10000,0.4414566531777382,0.34599998593330383,0.31700000166893005,0.38999998569488525,0.45399999618530273,0.22200000286102295,0.36399999260902405,0.6899999976158142,0.6959999799728394,0.3659999966621399,0.39500001072883606,0.5139999985694885,0.5059999823570251,0.3189352750778198,0.33541902899742126,0.48899999260902405,0.4819999933242798,0.30218935012817383,0.31765326857566833
7
+ edu_fineweb_350b_tokens-seed-1,0,12000,0.4507521316409111,0.36800000071525574,0.33500000834465027,0.39399999380111694,0.4699999988079071,0.23600000143051147,0.3659999966621399,0.6959999799728394,0.7020000219345093,0.3700000047683716,0.39899998903274536,0.5389999747276306,0.5320000052452087,0.32298433780670166,0.3407149612903595,0.47850000858306885,0.4794999957084656,0.3058593273162842,0.3225170373916626
8
+ edu_fineweb_350b_tokens-seed-1,0,14000,0.4495235048234463,0.3499999940395355,0.3269999921321869,0.3959999978542328,0.4749999940395355,0.2280000001192093,0.37599998712539673,0.6890000104904175,0.7009999752044678,0.3779999911785126,0.40400001406669617,0.5389999747276306,0.5220000147819519,0.3227299153804779,0.33800336718559265,0.4740000069141388,0.4715000092983246,0.306130588054657,0.3196880519390106
9
+ edu_fineweb_350b_tokens-seed-1,0,16000,0.45921100676059723,0.3700000047683716,0.3330000042915344,0.4020000100135803,0.4690000116825104,0.25999999046325684,0.38600000739097595,0.699999988079071,0.7279999852180481,0.38100001215934753,0.4129999876022339,0.5350000262260437,0.5260000228881836,0.32965603470802307,0.35061872005462646,0.5005000233650208,0.4860000014305115,0.31190600991249084,0.33268803358078003
10
+ edu_fineweb_350b_tokens-seed-1,0,18000,0.46205566823482513,0.3709999918937683,0.33899998664855957,0.4009999930858612,0.49799999594688416,0.2280000001192093,0.3779999911785126,0.6890000104904175,0.7160000205039978,0.38199999928474426,0.41499999165534973,0.5339999794960022,0.5210000276565552,0.33332228660583496,0.3538905084133148,0.4970000088214874,0.4934999942779541,0.3169640302658081,0.33594533801078796
11
+ edu_fineweb_350b_tokens-seed-1,0,20000,0.4671029560267925,0.3919999897480011,0.3490000069141388,0.4129999876022339,0.49399998784065247,0.25600001215934753,0.3799999952316284,0.7070000171661377,0.7319999933242798,0.37400001287460327,0.4059999883174896,0.5249999761581421,0.5289999842643738,0.3345783054828644,0.35636845231056213,0.5044999718666077,0.5090000033378601,0.3170454502105713,0.33782368898391724
12
+ edu_fineweb_350b_tokens-seed-1,0,22000,0.463470172137022,0.3889999985694885,0.3310000002384186,0.4169999957084656,0.5,0.25200000405311584,0.37599998712539673,0.6899999976158142,0.7269999980926514,0.3840000033378601,0.4020000100135803,0.5550000071525574,0.5400000214576721,0.32882845401763916,0.3514060974121094,0.4909999966621399,0.49900001287460327,0.31061139702796936,0.33276134729385376
13
+ edu_fineweb_350b_tokens-seed-1,0,24000,0.4662625528872013,0.39800000190734863,0.3409999907016754,0.41200000047683716,0.515999972820282,0.25,0.3700000047683716,0.6949999928474426,0.7329999804496765,0.382999986410141,0.3959999978542328,0.5509999990463257,0.5400000214576721,0.3372650444507599,0.3554573357105255,0.5095000267028809,0.4970000088214874,0.3195478320121765,0.33710044622421265
14
+ edu_fineweb_350b_tokens-seed-1,0,26000,0.4668670482933521,0.3930000066757202,0.32899999618530273,0.421999990940094,0.5180000066757202,0.2619999945163727,0.3840000033378601,0.7049999833106995,0.7239999771118164,0.39100000262260437,0.3930000066757202,0.5550000071525574,0.5360000133514404,0.33861491084098816,0.3594595789909363,0.5105000138282776,0.5095000267028809,0.3203679025173187,0.34143635630607605
15
+ edu_fineweb_350b_tokens-seed-1,0,28000,0.4710822217166424,0.4129999876022339,0.3490000069141388,0.4189999997615814,0.5239999890327454,0.2460000067949295,0.37599998712539673,0.7129999995231628,0.7360000014305115,0.3889999985694885,0.40799999237060547,0.5550000071525574,0.5410000085830688,0.33095699548721313,0.35399219393730164,0.4975000023841858,0.5,0.31196850538253784,0.33465778827667236
16
+ edu_fineweb_350b_tokens-seed-1,0,30000,0.4734017364680767,0.40400001406669617,0.3499999940395355,0.4269999861717224,0.5189999938011169,0.2460000067949295,0.3779999911785126,0.7139999866485596,0.7409999966621399,0.3970000147819519,0.4050000011920929,0.5550000071525574,0.5379999876022339,0.34207814931869507,0.36481064558029175,0.512499988079071,0.5095000267028809,0.3239838480949402,0.3467139005661011
17
+ edu_fineweb_350b_tokens-seed-1,0,32000,0.4748654440045357,0.4050000011920929,0.3499999940395355,0.42500001192092896,0.5329999923706055,0.2540000081062317,0.39800000190734863,0.699999988079071,0.7260000109672546,0.3779999911785126,0.40299999713897705,0.5609999895095825,0.5370000004768372,0.3364258408546448,0.36298680305480957,0.5120000243186951,0.5074999928474426,0.3179066479206085,0.3444235622882843
18
+ edu_fineweb_350b_tokens-seed-1,0,34000,0.4766022339463234,0.39500001072883606,0.3529999852180481,0.4230000078678131,0.5329999923706055,0.24199999868869781,0.3799999952316284,0.6970000267028809,0.7390000224113464,0.38999998569488525,0.4099999964237213,0.5490000247955322,0.5400000214576721,0.3360733985900879,0.35897108912467957,0.503000020980835,0.5184999704360962,0.3182942271232605,0.3393178880214691
19
+ edu_fineweb_350b_tokens-seed-1,0,36000,0.4756137728691101,0.4050000011920929,0.3619999885559082,0.42800000309944153,0.5320000052452087,0.2639999985694885,0.38199999928474426,0.7089999914169312,0.7409999966621399,0.3869999945163727,0.39800000190734863,0.5540000200271606,0.5440000295639038,0.3384101688861847,0.3646673560142517,0.5059999823570251,0.49950000643730164,0.31995898485183716,0.3464101552963257
20
+ edu_fineweb_350b_tokens-seed-1,0,38000,0.4826180450618267,0.4059999883174896,0.375,0.4230000078678131,0.5289999842643738,0.2619999945163727,0.39399999380111694,0.7080000042915344,0.7400000095367432,0.41100001335144043,0.4000000059604645,0.5590000152587891,0.5479999780654907,0.34553104639053345,0.37065890431404114,0.5239999890327454,0.5230000019073486,0.3270617127418518,0.3519443869590759
21
+ edu_fineweb_350b_tokens-seed-1,0,40000,0.4801179841160774,0.40299999713897705,0.35499998927116394,0.4300000071525574,0.5400000214576721,0.2619999945163727,0.39800000190734863,0.7049999833106995,0.746999979019165,0.4020000100135803,0.3970000147819519,0.5649999976158142,0.5540000200271606,0.33888015151023865,0.36192768812179565,0.5145000219345093,0.5074999928474426,0.3198457956314087,0.3424438536167145
22
+ edu_fineweb_350b_tokens-seed-1,0,42000,0.47703278809785843,0.4050000011920929,0.367000013589859,0.4269999861717224,0.5350000262260437,0.2540000081062317,0.3720000088214874,0.703000009059906,0.734000027179718,0.40299999713897705,0.39899998903274536,0.5460000038146973,0.5339999794960022,0.34130677580833435,0.3668607473373413,0.5264999866485596,0.5270000100135803,0.32270070910453796,0.3482622504234314
23
+ edu_fineweb_350b_tokens-seed-1,0,44000,0.4772367440164089,0.4090000092983246,0.3490000069141388,0.43299999833106995,0.5370000004768372,0.25,0.3880000114440918,0.7110000252723694,0.7360000014305115,0.3919999897480011,0.3959999978542328,0.5490000247955322,0.5479999780654907,0.34139013290405273,0.3680916130542755,0.5105000138282776,0.5139999985694885,0.3232519030570984,0.3498939573764801
24
+ edu_fineweb_350b_tokens-seed-1,0,46000,0.4836798347532749,0.4099999964237213,0.36500000953674316,0.4230000078678131,0.5450000166893005,0.25600001215934753,0.38999998569488525,0.7120000123977661,0.746999979019165,0.38999998569488525,0.39899998903274536,0.5550000071525574,0.5490000247955322,0.34736892580986023,0.36823850870132446,0.5299999713897705,0.5254999995231628,0.3293505311012268,0.3489386737346649
25
+ edu_fineweb_350b_tokens-seed-1,0,48000,0.47607188299298286,0.40400001406669617,0.3630000054836273,0.4350000023841858,0.5389999747276306,0.25,0.37599998712539673,0.7170000076293945,0.7409999966621399,0.3919999897480011,0.4000000059604645,0.5540000200271606,0.5429999828338623,0.33596572279930115,0.35683509707450867,0.5120000243186951,0.5095000267028809,0.3168731927871704,0.3370750844478607
26
+ edu_fineweb_350b_tokens-seed-1,0,50000,0.47911832481622696,0.4099999964237213,0.3499999940395355,0.4300000071525574,0.5479999780654907,0.25200000405311584,0.37599998712539673,0.7099999785423279,0.7419999837875366,0.3880000114440918,0.3869999945163727,0.5490000247955322,0.5440000295639038,0.3498813509941101,0.3725147545337677,0.531499981880188,0.531499981880188,0.3322857618331909,0.3544466495513916
27
+ edu_fineweb_350b_tokens-seed-1,0,52000,0.48371143266558647,0.41499999165534973,0.375,0.4350000023841858,0.5540000200271606,0.25600001215934753,0.39399999380111694,0.7179999947547913,0.7459999918937683,0.3880000114440918,0.3959999978542328,0.5600000023841858,0.5360000133514404,0.34429991245269775,0.3713294565677643,0.5199999809265137,0.515999972820282,0.32574552297592163,0.3526914715766907
28
+ edu_fineweb_350b_tokens-seed-1,0,54000,0.4845060631632805,0.41999998688697815,0.367000013589859,0.4410000145435333,0.5460000038146973,0.2460000067949295,0.3700000047683716,0.7049999833106995,0.7459999918937683,0.39899998903274536,0.40400001406669617,0.5569999814033508,0.5569999814033508,0.3471878468990326,0.3677656352519989,0.534500002861023,0.5379999876022339,0.3286001682281494,0.34804850816726685
29
+ edu_fineweb_350b_tokens-seed-1,0,56000,0.48096122220158577,0.40400001406669617,0.3580000102519989,0.4399999976158142,0.5389999747276306,0.27000001072883606,0.40400001406669617,0.7110000252723694,0.7360000014305115,0.38999998569488525,0.39100000262260437,0.5619999766349792,0.5490000247955322,0.354465126991272,0.37144333124160767,0.5195000171661377,0.5174999833106995,0.3372672498226166,0.35318976640701294
30
+ edu_fineweb_350b_tokens-seed-1,0,58000,0.48549820110201836,0.4090000092983246,0.3619999885559082,0.44200000166893005,0.5460000038146973,0.25,0.38999998569488525,0.7279999852180481,0.7480000257492065,0.39100000262260437,0.4020000100135803,0.5619999766349792,0.5490000247955322,0.351541131734848,0.3733488619327545,0.5299999713897705,0.5325000286102295,0.333476722240448,0.35448554158210754
31
+ edu_fineweb_350b_tokens-seed-1,0,60000,0.4874906800687313,0.41100001335144043,0.36399999260902405,0.4350000023841858,0.5519999861717224,0.2639999985694885,0.3919999897480011,0.718999981880188,0.7509999871253967,0.39800000190734863,0.40400001406669617,0.5730000138282776,0.5429999828338623,0.34932613372802734,0.37534230947494507,0.5385000109672546,0.5375000238418579,0.3303368091583252,0.3564254641532898
32
+ edu_fineweb_350b_tokens-seed-1,0,62000,0.48836689814925194,0.4090000092983246,0.36399999260902405,0.4359999895095825,0.5580000281333923,0.27000001072883606,0.4020000100135803,0.7260000109672546,0.7519999742507935,0.40799999237060547,0.39500001072883606,0.5669999718666077,0.550000011920929,0.34964632987976074,0.37406620383262634,0.5389999747276306,0.531000018119812,0.33040371537208557,0.3549351394176483
33
+ edu_fineweb_350b_tokens-seed-1,0,64000,0.48763588443398476,0.4359999895095825,0.3580000102519989,0.44200000166893005,0.5600000023841858,0.24400000274181366,0.37400001287460327,0.7289999723434448,0.7519999742507935,0.39399999380111694,0.4050000011920929,0.5659999847412109,0.5490000247955322,0.3522980511188507,0.37986865639686584,0.5414999723434448,0.5414999723434448,0.3334276080131531,0.3615870773792267
34
+ edu_fineweb_350b_tokens-seed-1,0,66000,0.4877283312380314,0.41499999165534973,0.3619999885559082,0.4390000104904175,0.5479999780654907,0.2619999945163727,0.3799999952316284,0.7229999899864197,0.75,0.4000000059604645,0.4009999930858612,0.5730000138282776,0.5619999766349792,0.3582947850227356,0.3783249855041504,0.5389999747276306,0.5389999747276306,0.3403361439704895,0.3598267436027527
35
+ edu_fineweb_350b_tokens-seed-1,0,68000,0.49183737486600876,0.4269999861717224,0.37599998712539673,0.44999998807907104,0.5640000104904175,0.2680000066757202,0.3919999897480011,0.7239999771118164,0.7429999709129333,0.4000000059604645,0.39500001072883606,0.5720000267028809,0.5529999732971191,0.35574087500572205,0.3818822503089905,0.5419999957084656,0.5485000014305115,0.33680978417396545,0.36319905519485474
36
+ edu_fineweb_350b_tokens-seed-1,0,70000,0.49102676659822464,0.4189999997615814,0.35899999737739563,0.4480000138282776,0.5659999847412109,0.25600001215934753,0.38999998569488525,0.7149999737739563,0.7459999918937683,0.4000000059604645,0.3959999978542328,0.5529999732971191,0.5649999976158142,0.35513052344322205,0.379595547914505,0.5274999737739563,0.5454999804496765,0.337499737739563,0.3607141971588135
37
+ edu_fineweb_350b_tokens-seed-1,0,72000,0.4916461184620857,0.4169999957084656,0.35600000619888306,0.45500001311302185,0.5649999976158142,0.257999986410141,0.38999998569488525,0.7350000143051147,0.75,0.3869999945163727,0.4000000059604645,0.5590000152587891,0.5529999732971191,0.3563148081302643,0.38194817304611206,0.5429999828338623,0.5559999942779541,0.3379554748535156,0.36316898465156555
38
+ edu_fineweb_350b_tokens-seed-1,0,74000,0.49113815650343895,0.41100001335144043,0.36399999260902405,0.45899999141693115,0.5659999847412109,0.2720000147819519,0.3880000114440918,0.7329999804496765,0.7459999918937683,0.3889999985694885,0.40700000524520874,0.5600000023841858,0.5529999732971191,0.356362909078598,0.38203853368759155,0.5339999794960022,0.5414999723434448,0.33809807896614075,0.3636053204536438
39
+ edu_fineweb_350b_tokens-seed-1,0,76000,0.4931333474814892,0.4169999957084656,0.3630000054836273,0.453000009059906,0.5569999814033508,0.27399998903274536,0.40400001406669617,0.7279999852180481,0.746999979019165,0.4009999930858612,0.40400001406669617,0.5529999732971191,0.5479999780654907,0.3585507571697235,0.38335856795310974,0.5429999828338623,0.5575000047683716,0.34024208784103394,0.3645668029785156
40
+ edu_fineweb_350b_tokens-seed-1,0,78000,0.49161795899271965,0.4180000126361847,0.3619999885559082,0.453000009059906,0.5630000233650208,0.27799999713897705,0.38999998569488525,0.7329999804496765,0.7419999837875366,0.40400001406669617,0.40400001406669617,0.5580000281333923,0.5580000281333923,0.358989417552948,0.3820967674255371,0.5460000038146973,0.5504999756813049,0.34032127261161804,0.36344367265701294
41
+ edu_fineweb_350b_tokens-seed-1,0,80000,0.4911441020667553,0.41600000858306885,0.35899999737739563,0.44699999690055847,0.5740000009536743,0.2619999945163727,0.39399999380111694,0.7300000190734863,0.7540000081062317,0.39800000190734863,0.39800000190734863,0.5490000247955322,0.5419999957084656,0.35973435640335083,0.3821263015270233,0.5370000004768372,0.5444999933242798,0.3422233760356903,0.3636528253555298
42
+ edu_fineweb_350b_tokens-seed-1,0,82000,0.4927275590598583,0.42399999499320984,0.36500000953674316,0.4410000145435333,0.5580000281333923,0.2919999957084656,0.3959999978542328,0.7250000238418579,0.7390000224113464,0.39100000262260437,0.40700000524520874,0.5720000267028809,0.5590000152587891,0.36026227474212646,0.38345789909362793,0.5600000023841858,0.5529999732971191,0.341264009475708,0.3648204207420349
43
+ edu_fineweb_350b_tokens-seed-1,0,84000,0.49356314539909363,0.4350000023841858,0.367000013589859,0.45399999618530273,0.5659999847412109,0.2800000011920929,0.39800000190734863,0.7260000109672546,0.7369999885559082,0.39399999380111694,0.40700000524520874,0.5450000166893005,0.5529999732971191,0.3620130121707916,0.38375842571258545,0.5640000104904175,0.5554999709129333,0.3433130979537964,0.365005224943161
44
+ edu_fineweb_350b_tokens-seed-1,0,86000,0.4939306415617466,0.42100000381469727,0.367000013589859,0.45100000500679016,0.5709999799728394,0.2840000092983246,0.3880000114440918,0.7269999980926514,0.7519999742507935,0.39399999380111694,0.4059999883174896,0.5820000171661377,0.5669999718666077,0.3596394956111908,0.3779442608356476,0.5419999957084656,0.5419999957084656,0.34093979001045227,0.3584451973438263
45
+ edu_fineweb_350b_tokens-seed-1,0,88000,0.5010630041360855,0.42500001192092896,0.3709999918937683,0.45500001311302185,0.5789999961853027,0.2680000066757202,0.39800000190734863,0.7310000061988831,0.7540000081062317,0.41100001335144043,0.41200000047683716,0.5839999914169312,0.5649999976158142,0.3666640520095825,0.3865112364292145,0.5575000047683716,0.5625,0.34814321994781494,0.36700403690338135
46
+ edu_fineweb_350b_tokens-seed-1,0,90000,0.49995486810803413,0.41100001335144043,0.3630000054836273,0.4560000002384186,0.574999988079071,0.27399998903274536,0.40400001406669617,0.7300000190734863,0.7570000290870667,0.4059999883174896,0.41200000047683716,0.5709999799728394,0.5600000023841858,0.3633595407009125,0.38647565245628357,0.5565000176429749,0.5615000128746033,0.34486615657806396,0.36713889241218567
47
+ edu_fineweb_350b_tokens-seed-1,0,92000,0.49723950773477554,0.42899999022483826,0.3709999918937683,0.45399999618530273,0.5649999976158142,0.2800000011920929,0.39800000190734863,0.7329999804496765,0.7519999742507935,0.3970000147819519,0.4180000126361847,0.578000009059906,0.5619999766349792,0.3627259135246277,0.3835802674293518,0.546500027179718,0.5475000143051147,0.3440909683704376,0.36441609263420105
48
+ edu_fineweb_350b_tokens-seed-1,0,94000,0.4972243830561638,0.42500001192092896,0.3610000014305115,0.4519999921321869,0.5709999799728394,0.2840000092983246,0.40400001406669617,0.7279999852180481,0.7459999918937683,0.4050000011920929,0.41499999165534973,0.5690000057220459,0.5600000023841858,0.36130890250205994,0.3823433518409729,0.5559999942779541,0.5580000281333923,0.34228208661079407,0.36279505491256714
49
+ edu_fineweb_350b_tokens-seed-1,0,96000,0.4945024959743023,0.41100001335144043,0.38100001215934753,0.45500001311302185,0.5730000138282776,0.2759999930858612,0.38600000739097595,0.7329999804496765,0.75,0.39899998903274536,0.4000000059604645,0.5720000267028809,0.5509999990463257,0.3607921898365021,0.382955938577652,0.5414999723434448,0.5509999990463257,0.34249985218048096,0.3640199303627014
50
+ edu_fineweb_350b_tokens-seed-1,0,98000,0.4953378774225712,0.40700000524520874,0.367000013589859,0.46299999952316284,0.5740000009536743,0.27799999713897705,0.3919999897480011,0.7269999980926514,0.7559999823570251,0.39399999380111694,0.40700000524520874,0.5759999752044678,0.5540000200271606,0.3606520891189575,0.3814857304096222,0.5460000038146973,0.5504999756813049,0.34219974279403687,0.36220303177833557
51
+ edu_fineweb_350b_tokens-seed-1,0,100000,0.499586995691061,0.41200000047683716,0.3659999966621399,0.4620000123977661,0.5830000042915344,0.26600000262260437,0.4000000059604645,0.7250000238418579,0.7590000033378601,0.4009999930858612,0.4099999964237213,0.5720000267028809,0.5720000267028809,0.35491371154785156,0.3793718218803406,0.5419999957084656,0.5475000143051147,0.3359191417694092,0.35919591784477234
52
+ edu_fineweb_350b_tokens-seed-1,0,102000,0.4977268613874912,0.41499999165534973,0.38199999928474426,0.4580000042915344,0.574999988079071,0.2720000147819519,0.414000004529953,0.7229999899864197,0.7509999871253967,0.3970000147819519,0.4129999876022339,0.5619999766349792,0.5509999990463257,0.3581334054470062,0.3803453743457794,0.5354999899864197,0.5350000262260437,0.3400118052959442,0.3608148992061615
53
+ edu_fineweb_350b_tokens-seed-1,0,104000,0.4989877715706825,0.41100001335144043,0.36000001430511475,0.46299999952316284,0.5709999799728394,0.27399998903274536,0.39399999380111694,0.7329999804496765,0.7570000290870667,0.414000004529953,0.41499999165534973,0.5709999799728394,0.5590000152587891,0.3643620014190674,0.38796037435531616,0.559499979019165,0.5669999718666077,0.34558823704719543,0.36890217661857605
54
+ edu_fineweb_350b_tokens-seed-1,0,106000,0.4987182281911373,0.41499999165534973,0.3700000047683716,0.4659999907016754,0.578000009059906,0.2840000092983246,0.4059999883174896,0.7260000109672546,0.7480000257492065,0.40700000524520874,0.4090000092983246,0.5680000185966492,0.5550000071525574,0.3635769784450531,0.38422322273254395,0.5450000166893005,0.5590000152587891,0.3452018201351166,0.36474576592445374
55
+ edu_fineweb_350b_tokens-seed-1,0,108000,0.5014015696942806,0.4320000112056732,0.36899998784065247,0.4620000123977661,0.5860000252723694,0.2840000092983246,0.40400001406669617,0.7350000143051147,0.75,0.40299999713897705,0.41100001335144043,0.5630000233650208,0.5699999928474426,0.36332157254219055,0.3844863772392273,0.5550000071525574,0.5565000176429749,0.34433162212371826,0.36471250653266907
56
+ edu_fineweb_350b_tokens-seed-1,0,110000,0.5043375752866268,0.4230000078678131,0.37700000405311584,0.4620000123977661,0.5730000138282776,0.2879999876022339,0.41200000047683716,0.7419999837875366,0.7749999761581421,0.39899998903274536,0.414000004529953,0.5720000267028809,0.5590000152587891,0.36554232239723206,0.3870066702365875,0.5554999709129333,0.5575000047683716,0.3467237055301666,0.3672005832195282
57
+ edu_fineweb_350b_tokens-seed-1,0,112000,0.5046022236347198,0.4020000100135803,0.36800000071525574,0.4650000035762787,0.5839999914169312,0.2840000092983246,0.4300000071525574,0.7310000061988831,0.7540000081062317,0.40400001406669617,0.40700000524520874,0.5720000267028809,0.5569999814033508,0.367435097694397,0.39113253355026245,0.5565000176429749,0.5649999976158142,0.34933826327323914,0.37181779742240906
58
+ edu_fineweb_350b_tokens-seed-1,0,114000,0.5044217295944691,0.4230000078678131,0.3709999918937683,0.4659999907016754,0.5830000042915344,0.2939999997615814,0.4180000126361847,0.7289999723434448,0.7549999952316284,0.4020000100135803,0.40700000524520874,0.5709999799728394,0.5640000104904175,0.36639603972435,0.38989707827568054,0.5580000281333923,0.5669999718666077,0.34762707352638245,0.37037384510040283
59
+ edu_fineweb_350b_tokens-seed-1,0,116000,0.5037211365997791,0.41600000858306885,0.3720000088214874,0.46399998664855957,0.5899999737739563,0.3059999942779541,0.41200000047683716,0.7300000190734863,0.7540000081062317,0.4020000100135803,0.41200000047683716,0.5659999847412109,0.5600000023841858,0.3651641011238098,0.3900282680988312,0.5569999814033508,0.5590000152587891,0.34627485275268555,0.37076908349990845
60
+ edu_fineweb_350b_tokens-seed-1,0,118000,0.5019807890057564,0.41200000047683716,0.3799999952316284,0.46399998664855957,0.5889999866485596,0.2879999876022339,0.4059999883174896,0.7419999837875366,0.7630000114440918,0.4009999930858612,0.39800000190734863,0.5600000023841858,0.5609999895095825,0.36456480622291565,0.3854806423187256,0.5534999966621399,0.5529999732971191,0.34601250290870667,0.3658463656902313
61
+ edu_fineweb_350b_tokens-seed-1,0,120000,0.5008365102112293,0.4259999990463257,0.367000013589859,0.47200000286102295,0.5799999833106995,0.29600000381469727,0.41600000858306885,0.734000027179718,0.7580000162124634,0.3970000147819519,0.40400001406669617,0.5559999942779541,0.5550000071525574,0.3660300672054291,0.3887145519256592,0.5529999732971191,0.5569999814033508,0.34745535254478455,0.3696920573711395
62
+ edu_fineweb_350b_tokens-seed-1,0,122000,0.5028450302779675,0.41999998688697815,0.3700000047683716,0.4659999907016754,0.5929999947547913,0.2919999957084656,0.4020000100135803,0.7400000095367432,0.7590000033378601,0.4000000059604645,0.40400001406669617,0.5659999847412109,0.574999988079071,0.365267276763916,0.3863743841648102,0.5540000200271606,0.5529999732971191,0.3464977741241455,0.36676025390625
63
+ edu_fineweb_350b_tokens-seed-1,0,124000,0.5072730705142021,0.41499999165534973,0.3720000088214874,0.47099998593330383,0.5860000252723694,0.2919999957084656,0.42800000309944153,0.734000027179718,0.7620000243186951,0.3930000066757202,0.40700000524520874,0.5669999718666077,0.5649999976158142,0.365468293428421,0.3888309895992279,0.5544999837875366,0.5695000290870667,0.3469199538230896,0.36868447065353394
64
+ edu_fineweb_350b_tokens-seed-1,0,126000,0.501251045614481,0.4180000126361847,0.3659999966621399,0.47099998593330383,0.5839999914169312,0.2879999876022339,0.41999998688697815,0.7369999885559082,0.7630000114440918,0.39500001072883606,0.4059999883174896,0.5600000023841858,0.5580000281333923,0.3610328137874603,0.3824765980243683,0.5485000014305115,0.5504999756813049,0.3421251177787781,0.3625083863735199
65
+ edu_fineweb_350b_tokens-seed-1,0,128000,0.5086825042963028,0.42800000309944153,0.3779999911785126,0.46399998664855957,0.5860000252723694,0.28200000524520874,0.41999998688697815,0.7369999885559082,0.7580000162124634,0.3869999945163727,0.41499999165534973,0.5730000138282776,0.5680000185966492,0.3646826446056366,0.3922956585884094,0.5584999918937683,0.5720000267028809,0.3459012806415558,0.3724599778652191
66
+ edu_fineweb_350b_tokens-seed-1,0,130000,0.5078329555690289,0.4339999854564667,0.37400001287460327,0.46299999952316284,0.5849999785423279,0.2919999957084656,0.41999998688697815,0.7350000143051147,0.7570000290870667,0.3889999985694885,0.41600000858306885,0.5669999718666077,0.5720000267028809,0.3667779266834259,0.39438962936401367,0.559499979019165,0.5634999871253967,0.34809762239456177,0.37516361474990845
67
+ edu_fineweb_350b_tokens-seed-1,0,132000,0.5061904042959213,0.4230000078678131,0.3659999966621399,0.4659999907016754,0.5879999995231628,0.28999999165534973,0.41999998688697815,0.746999979019165,0.7630000114440918,0.4050000011920929,0.4090000092983246,0.5820000171661377,0.5659999847412109,0.3683132827281952,0.39066654443740845,0.5619999766349792,0.5665000081062317,0.34918180108070374,0.3710232377052307
68
+ edu_fineweb_350b_tokens-seed-1,0,134000,0.5059116296470165,0.4189999997615814,0.3709999918937683,0.47600001096725464,0.5870000123977661,0.2939999997615814,0.41999998688697815,0.7409999966621399,0.7570000290870667,0.4000000059604645,0.4099999964237213,0.5720000267028809,0.5669999718666077,0.3706933557987213,0.3925185203552246,0.5615000128746033,0.5619999766349792,0.3521064519882202,0.37329307198524475
69
+ edu_fineweb_350b_tokens-seed-1,0,138000,0.5042317919433117,0.4230000078678131,0.367000013589859,0.4740000069141388,0.5860000252723694,0.2879999876022339,0.41600000858306885,0.7390000224113464,0.765999972820282,0.3959999978542328,0.3970000147819519,0.5740000009536743,0.5609999895095825,0.370014488697052,0.39456456899642944,0.5630000233650208,0.5649999976158142,0.35142001509666443,0.3758543133735657
70
+ edu_fineweb_350b_tokens-seed-1,0,140000,0.5045665018260479,0.41999998688697815,0.3799999952316284,0.4690000116825104,0.5820000171661377,0.2720000147819519,0.4099999964237213,0.7369999885559082,0.7670000195503235,0.3970000147819519,0.4000000059604645,0.5609999895095825,0.5590000152587891,0.37111103534698486,0.3926049470901489,0.5590000152587891,0.5649999976158142,0.35347750782966614,0.3735319674015045
71
+ edu_fineweb_350b_tokens-seed-1,0,142000,0.5085432901978493,0.421999990940094,0.3709999918937683,0.47200000286102295,0.5950000286102295,0.27000001072883606,0.414000004529953,0.7329999804496765,0.7630000114440918,0.3930000066757202,0.40400001406669617,0.5720000267028809,0.5680000185966492,0.369999498128891,0.39443445205688477,0.565500020980835,0.578499972820282,0.3518766164779663,0.37484627962112427
72
+ edu_fineweb_350b_tokens-seed-1,0,144000,0.5049711987376213,0.43299999833106995,0.36899998784065247,0.46399998664855957,0.5989999771118164,0.2720000147819519,0.40799999237060547,0.7329999804496765,0.7590000033378601,0.3880000114440918,0.3959999978542328,0.5659999847412109,0.5690000057220459,0.36937984824180603,0.3920287489891052,0.5665000081062317,0.5669999718666077,0.3512401580810547,0.37276965379714966
73
+ edu_fineweb_350b_tokens-seed-1,0,148000,0.5123470723628998,0.4269999861717224,0.3790000081062317,0.47099998593330383,0.5950000286102295,0.2800000011920929,0.42399999499320984,0.7310000061988831,0.7649999856948853,0.39899998903274536,0.41100001335144043,0.5770000219345093,0.5770000219345093,0.36998581886291504,0.3943348228931427,0.5674999952316284,0.5734999775886536,0.3513873219490051,0.3742765486240387
74
+ edu_fineweb_350b_tokens-seed-1,0,150000,0.509139034897089,0.41100001335144043,0.3779999911785126,0.4740000069141388,0.593999981880188,0.2720000147819519,0.41200000047683716,0.7319999933242798,0.7649999856948853,0.4009999930858612,0.40400001406669617,0.5680000185966492,0.5649999976158142,0.3709569573402405,0.3972984850406647,0.5740000009536743,0.5770000219345093,0.35274040699005127,0.3781122863292694
75
+ edu_fineweb_350b_tokens-seed-1,0,152000,0.5079653523862362,0.4169999957084656,0.36800000071525574,0.4729999899864197,0.5960000157356262,0.2800000011920929,0.40799999237060547,0.7390000224113464,0.7630000114440918,0.39800000190734863,0.40400001406669617,0.5720000267028809,0.574999988079071,0.3738082945346832,0.3963184654712677,0.5715000033378601,0.5724999904632568,0.3557112216949463,0.37722280621528625
76
+ edu_fineweb_350b_tokens-seed-1,0,154000,0.5107964277267456,0.4230000078678131,0.3700000047683716,0.4749999940395355,0.5960000157356262,0.2879999876022339,0.41999998688697815,0.7350000143051147,0.7639999985694885,0.4000000059604645,0.41200000047683716,0.5730000138282776,0.5709999799728394,0.37119078636169434,0.39557957649230957,0.5720000267028809,0.5774999856948853,0.3524456322193146,0.3758714497089386
77
+ edu_fineweb_350b_tokens-seed-1,0,156000,0.5088927485048771,0.41999998688697815,0.3709999918937683,0.46700000762939453,0.5910000205039978,0.2919999957084656,0.414000004529953,0.7369999885559082,0.765999972820282,0.39399999380111694,0.4059999883174896,0.5799999833106995,0.5669999718666077,0.3713153600692749,0.3947707414627075,0.5745000243186951,0.5809999704360962,0.35257014632225037,0.37514206767082214
78
+ edu_fineweb_350b_tokens-seed-1,0,158000,0.5088703669607639,0.41999998688697815,0.375,0.4699999988079071,0.5979999899864197,0.28200000524520874,0.40400001406669617,0.7450000047683716,0.7680000066757202,0.39500001072883606,0.4009999930858612,0.574999988079071,0.574999988079071,0.36792638897895813,0.3940982520580292,0.5680000185966492,0.5755000114440918,0.3489862382411957,0.37446293234825134
79
+ edu_fineweb_350b_tokens-seed-1,0,160000,0.5071291252970695,0.4300000071525574,0.35899999737739563,0.4729999899864197,0.5929999947547913,0.28200000524520874,0.4180000126361847,0.7440000176429749,0.7630000114440918,0.3919999897480011,0.4020000100135803,0.5759999752044678,0.574999988079071,0.36913686990737915,0.3938981890678406,0.5669999718666077,0.5724999904632568,0.3502262532711029,0.3745329976081848
80
+ edu_fineweb_350b_tokens-seed-1,0,162000,0.5091184116899967,0.41600000858306885,0.367000013589859,0.4740000069141388,0.5920000076293945,0.2879999876022339,0.40799999237060547,0.746999979019165,0.7689999938011169,0.38999998569488525,0.4090000092983246,0.5720000267028809,0.5770000219345093,0.36741992831230164,0.39286142587661743,0.5720000267028809,0.578000009059906,0.3482683300971985,0.37294724583625793
81
+ edu_fineweb_350b_tokens-seed-1,0,164000,0.5078432261943817,0.41600000858306885,0.36500000953674316,0.46700000762939453,0.5910000205039978,0.2759999930858612,0.40799999237060547,0.7369999885559082,0.7689999938011169,0.39500001072883606,0.4059999883174896,0.5759999752044678,0.5799999833106995,0.36831894516944885,0.3920001685619354,0.5634999871253967,0.5715000033378601,0.3499426543712616,0.37224581837654114
82
+ edu_fineweb_350b_tokens-seed-1,0,166000,0.5083079226315022,0.41499999165534973,0.36399999260902405,0.47200000286102295,0.5929999947547913,0.28200000524520874,0.414000004529953,0.7400000095367432,0.7680000066757202,0.4009999930858612,0.40799999237060547,0.574999988079071,0.5699999928474426,0.37059345841407776,0.3931756019592285,0.5640000104904175,0.5759999752044678,0.3522030711174011,0.3734634220600128
83
+ edu_fineweb_350b_tokens-seed-1,0,167000,0.509494174271822,0.42899999022483826,0.3619999885559082,0.47200000286102295,0.597000002861023,0.28999999165534973,0.4180000126361847,0.7379999756813049,0.7689999938011169,0.39500001072883606,0.40400001406669617,0.5820000171661377,0.578000009059906,0.3696656823158264,0.3941360414028168,0.5669999718666077,0.5734999775886536,0.3506713807582855,0.3744533956050873
notebooks/minhash_params.ipynb ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 8,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "{'prob': {'file': 'prob.json'}}"
12
+ ]
13
+ },
14
+ "execution_count": 8,
15
+ "metadata": {},
16
+ "output_type": "execute_result"
17
+ }
18
+ ],
19
+ "source": [
20
+ "import json\n",
21
+ "import numpy as np\n",
22
+ "import plotly.graph_objects as go\n",
23
+ "RED_FULL=\"rgba(255, 0, 0, 1)\"\n",
24
+ "\n",
25
+ "# Define the function 1 - (1 - x^8)^14\n",
26
+ "def func1(x):\n",
27
+ " return 1 - np.power(1 - np.power(x, 8), 14)\n",
28
+ "\n",
29
+ "# Define the function 1 - (1 - x^20)^450\n",
30
+ "def func2(x):\n",
31
+ " return 1 - np.power(1 - np.power(x, 20), 450)\n",
32
+ "\n",
33
+ "# Generate x values from 0 to 1\n",
34
+ "x = np.linspace(0, 1, 1000)\n",
35
+ "\n",
36
+ "# Calculate y values for each function\n",
37
+ "y1 = func1(x)\n",
38
+ "y2 = func2(x)\n",
39
+ "\n",
40
+ "# Create traces\n",
41
+ "trace1 = go.Scatter(x=x, y=y1, mode='lines', name='FineWeb: 1-(1-s^8)^14')\n",
42
+ "trace2 = go.Scatter(x=x, y=y2, mode='lines', name='RefinedWeb: 1-(1-s^20)^450')\n",
43
+ "vertical_line = go.Scatter(x=[0.75, 0.75], y=[0, 1], mode='lines', line=dict(color='red', dash='dash'), name='Threshold')\n",
44
+ "\n",
45
+ "# Define layout\n",
46
+ "layout = {\n",
47
+ " 'title': {\n",
48
+ " 'text': 'MinHash parameters',\n",
49
+ " },\n",
50
+ " 'xaxis': {\n",
51
+ " 'title': {\n",
52
+ " 'text': 'Document similarity (s)',\n",
53
+ " },\n",
54
+ " },\n",
55
+ " 'yaxis': {\n",
56
+ " 'title': {\n",
57
+ " 'text': 'Matched as dups probability',\n",
58
+ " },\n",
59
+ " },\n",
60
+ "}\n",
61
+ "\n",
62
+ "\n",
63
+ "def normalize_run_name(run_name):\n",
64
+ " return run_name.replace(\"/\", \"_\")\n",
65
+ "\n",
66
+ "\n",
67
+ "def save_for_plot(dir_name, df, views, xlabel=\"Dataset\", ylabel=\"Matched as dups probability\", plot_name=\"plot name\", custom_layout={}, ranges={}, x_column=None, default_metric=None):\n",
68
+ " import os\n",
69
+ " files = {}\n",
70
+ " os.makedirs(f\"data/plots/{dir_name}\", exist_ok=True)\n",
71
+ " for view in views:\n",
72
+ " data = {}\n",
73
+ " for run_name in df[\"runname\"].unique():\n",
74
+ " run_name_only=df[df[\"runname\"]==run_name]\n",
75
+ " data[run_name] = {\n",
76
+ " \"x\": run_name_only[x_column].tolist() if x_column else [run_name],\n",
77
+ " \"y\": run_name_only[view].tolist(),\n",
78
+ " \"label\": run_name,\n",
79
+ " }\n",
80
+ " file_name = f\"{normalize_run_name(view)}.json\"\n",
81
+ " files[view] = {\"file\": f\"{file_name}\"}\n",
82
+ " with open(f\"data/plots/{dir_name}/{file_name}\", \"w\") as f:\n",
83
+ " json.dump({\n",
84
+ " \"data\": data,\n",
85
+ " \"layout\": {\n",
86
+ " \"title\": {\n",
87
+ " \"text\": plot_name,\n",
88
+ " },\n",
89
+ " \"xaxis\": {\n",
90
+ " \"title\": {\n",
91
+ " \"text\": xlabel,\n",
92
+ " },\n",
93
+ " },\n",
94
+ " \"yaxis\": {\n",
95
+ " # \"range\": ranges.get(view, None),\n",
96
+ " \"title\": {\n",
97
+ " \"text\": ylabel,\n",
98
+ " },\n",
99
+ " },\n",
100
+ " \"shapes\": [\n",
101
+ " {\n",
102
+ " \"type\": \"line\",\n",
103
+ " \"x0\": 0.75,\n",
104
+ " \"y0\": 0.0,\n",
105
+ " \"x1\": 0.75,\n",
106
+ " \"y1\": 1.2,\n",
107
+ " \"xref\": \"x\",\n",
108
+ " \"yref\": \"y\",\n",
109
+ " \"line\": {\n",
110
+ " \"color\": RED_FULL,\n",
111
+ " \"width\": 1,\n",
112
+ " \"dash\": \"dashdot\"\n",
113
+ " },\n",
114
+ " \"showarrow\": False\n",
115
+ " }\n",
116
+ " ],\n",
117
+ " **custom_layout,\n",
118
+ " },\n",
119
+ " }, f)\n",
120
+ " with open(f\"data/plots/{dir_name}/index.json\", \"w\") as f:\n",
121
+ " json.dump({\n",
122
+ " \"files\": files,\n",
123
+ " \"settings\": {\n",
124
+ " \"defaultMetric\": default_metric,\n",
125
+ " \"slider\": None,\n",
126
+ " \"autoSetXRange\": False,\n",
127
+ " }\n",
128
+ " }, f)\n",
129
+ " return files\n",
130
+ "\n",
131
+ "import pandas as pd\n",
132
+ "df = pd.DataFrame({\n",
133
+ " \"runname\": [\"FineWeb: 1-(1-s^8)^14\"]*len(x) + [\"RefinedWeb: 1-(1-s^20)^450\"]*len(x),\n",
134
+ " \"similarity\": x.tolist()+x.tolist(),\n",
135
+ " \"prob\": y1.tolist()+y2.tolist(),\n",
136
+ " \"view\": [\"normal\"]*2*len(x)\n",
137
+ "})\n",
138
+ "\n",
139
+ "custom_layout = {\n",
140
+ " \"legend\": {\n",
141
+ " \"orientation\": \"v\",\n",
142
+ " \"xanchor\": \"left\",\n",
143
+ " \"yanchor\": \"top\",\n",
144
+ " \"x\": 0,\n",
145
+ " \"y\": 1,\n",
146
+ " },\n",
147
+ "}\n",
148
+ "\n",
149
+ "save_for_plot(\"minhash_params\", df, [\"prob\"], xlabel=\"Document similarity (s)\", plot_name=\"MinHash parameters\", custom_layout=custom_layout, ranges={}, x_column=\"similarity\", default_metric=\"prob\")"
150
+ ]
151
+ }
152
+ ],
153
+ "metadata": {
154
+ "kernelspec": {
155
+ "display_name": "datatrove",
156
+ "language": "python",
157
+ "name": "python3"
158
+ },
159
+ "language_info": {
160
+ "codemirror_mode": {
161
+ "name": "ipython",
162
+ "version": 3
163
+ },
164
+ "file_extension": ".py",
165
+ "mimetype": "text/x-python",
166
+ "name": "python",
167
+ "nbconvert_exporter": "python",
168
+ "pygments_lexer": "ipython3",
169
+ "version": "3.12.2"
170
+ }
171
+ },
172
+ "nbformat": 4,
173
+ "nbformat_minor": 2
174
+ }
notebooks/modify_jsons.ipynb ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 13,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import json\n",
11
+ "import orjson\n",
12
+ "\n",
13
+ "\n",
14
+ "def normalize_file_name(file_name):\n",
15
+ " return file_name.replace('/', '_')\n",
16
+ "\n",
17
+ "\n",
18
+ "def keep_key(key):\n",
19
+ " if key.endswith(\"acc\"):\n",
20
+ " return False\n",
21
+ " \n",
22
+ " if \"sciq\" in key:\n",
23
+ " return False\n",
24
+ "\n",
25
+ " if \"siqa\" in key:\n",
26
+ " return False\n",
27
+ "\n",
28
+ " return True\n",
29
+ "\n",
30
+ "\n",
31
+ "def get_slider_max(data):\n",
32
+ " metrics = data[list(data.keys())[0]]\n",
33
+ " metric_data = metrics[list(metrics.keys())[0]]\n",
34
+ " samples = len(metric_data[\"x\"])\n",
35
+ " if samples < 20:\n",
36
+ " return 10\n",
37
+ " return 30\n",
38
+ "\n",
39
+ "\n",
40
+ "def create_index(data, traces, layout, default_window_size, default_metric):\n",
41
+ " print(default_metric if default_metric else \"None\")\n",
42
+ " files_data = {}\n",
43
+ " index_files = {}\n",
44
+ " for task_id, task_data in (data.items() if data else traces.items()):\n",
45
+ " data_name = \"data\" if data else \"traces\"\n",
46
+ " files_data[task_id] = {\n",
47
+ " data_name: task_data,\n",
48
+ " \"layout\": layout\n",
49
+ " }\n",
50
+ " index_files[task_id] = {\n",
51
+ " \"file\": f\"{normalize_file_name(task_id)}.json\"\n",
52
+ " }\n",
53
+ " settings = {\n",
54
+ " \"slider\": {\n",
55
+ " \"min\": 0,\n",
56
+ " \"max\": get_slider_max(data),\n",
57
+ " \"default\": default_window_size,\n",
58
+ " },\n",
59
+ " \"defaultMetric\": default_metric\n",
60
+ " } if data else {\"slider\": None}\n",
61
+ " \n",
62
+ " return files_data, index_files, settings\n",
63
+ " \n",
64
+ " \n",
65
+ "\n",
66
+ "new_data = {}\n",
67
+ "\n",
68
+ "for file_name in os.listdir('./data/plots'):\n",
69
+ " if not file_name.endswith('.json'):\n",
70
+ " continue\n",
71
+ " with open(f'./data/plots/{file_name}', 'r') as file:\n",
72
+ " old_data = orjson.loads(file.read())\n",
73
+ " data = {key: value for key, value in old_data[\"data\"].items() if keep_key(key)} if \"data\" in old_data else {}\n",
74
+ " traces = {key: value for key, value in old_data[\"traces\"].items()} if \"traces\" in old_data else {}\n",
75
+ " default_window_size = old_data[\"defaultWindowSize\"] if \"defaultWindowSize\" in old_data else None\n",
76
+ " default_metric = old_data[\"defaultMetric\"] if \"defaultMetric\" in old_data else None\n",
77
+ " files_data, index_files, settings = create_index(data, traces, old_data[\"layout\"], default_window_size, default_metric)\n",
78
+ " # mkdir\n",
79
+ " dir_name = file_name.split('.')[0]\n",
80
+ " os.makedirs(f'./data/plots/{dir_name}', exist_ok=True)\n",
81
+ " with open(f'./data/plots/{dir_name}/index.json', 'wb') as file:\n",
82
+ " file.write(orjson.dumps({\n",
83
+ " \"files\": index_files,\n",
84
+ " \"settings\": settings,\n",
85
+ " }))\n",
86
+ " \n",
87
+ " for metric_name, data in files_data.items():\n",
88
+ " with open(f'./data/plots/{dir_name}/{normalize_file_name(metric_name)}.json', 'wb') as file:\n",
89
+ " file.write(orjson.dumps(data))\n",
90
+ "\n",
91
+ "\n"
92
+ ]
93
+ }
94
+ ],
95
+ "metadata": {
96
+ "kernelspec": {
97
+ "display_name": "datatrove3.10",
98
+ "language": "python",
99
+ "name": "python3"
100
+ },
101
+ "language_info": {
102
+ "codemirror_mode": {
103
+ "name": "ipython",
104
+ "version": 3
105
+ },
106
+ "file_extension": ".py",
107
+ "mimetype": "text/x-python",
108
+ "name": "python",
109
+ "nbconvert_exporter": "python",
110
+ "pygments_lexer": "ipython3",
111
+ "version": "3.10.6"
112
+ }
113
+ },
114
+ "nbformat": 4,
115
+ "nbformat_minor": 2
116
+ }
notebooks/plot_all-filtering-steps.ipynb ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-14T09:02:09.162993Z",
10
+ "start_time": "2024-05-14T09:02:09.134625Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>big-run-sampled-fineweb-c4-filters</td>\n",
63
+ " <td>6</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330893</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.208</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2195</td>\n",
80
+ " <td>0.2510</td>\n",
81
+ " <td>0.230294</td>\n",
82
+ " <td>0.250147</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>big-run-sampled-fineweb-c4-filters</td>\n",
87
+ " <td>6</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.359303</td>\n",
90
+ " <td>0.250</td>\n",
91
+ " <td>0.263</td>\n",
92
+ " <td>0.293</td>\n",
93
+ " <td>0.285</td>\n",
94
+ " <td>0.140</td>\n",
95
+ " <td>0.276</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.376</td>\n",
98
+ " <td>0.401</td>\n",
99
+ " <td>0.497</td>\n",
100
+ " <td>0.479</td>\n",
101
+ " <td>0.594</td>\n",
102
+ " <td>0.524</td>\n",
103
+ " <td>0.2740</td>\n",
104
+ " <td>0.2985</td>\n",
105
+ " <td>0.241617</td>\n",
106
+ " <td>0.251920</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>big-run-sampled-fineweb-c4-filters</td>\n",
111
+ " <td>6</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.375393</td>\n",
114
+ " <td>0.268</td>\n",
115
+ " <td>0.277</td>\n",
116
+ " <td>0.319</td>\n",
117
+ " <td>0.324</td>\n",
118
+ " <td>0.150</td>\n",
119
+ " <td>0.274</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.372</td>\n",
122
+ " <td>0.411</td>\n",
123
+ " <td>0.507</td>\n",
124
+ " <td>0.484</td>\n",
125
+ " <td>0.688</td>\n",
126
+ " <td>0.606</td>\n",
127
+ " <td>0.3015</td>\n",
128
+ " <td>0.3270</td>\n",
129
+ " <td>0.246577</td>\n",
130
+ " <td>0.259146</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>big-run-sampled-fineweb-c4-filters</td>\n",
135
+ " <td>6</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.389655</td>\n",
138
+ " <td>0.303</td>\n",
139
+ " <td>0.305</td>\n",
140
+ " <td>0.324</td>\n",
141
+ " <td>0.358</td>\n",
142
+ " <td>0.152</td>\n",
143
+ " <td>0.280</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.383</td>\n",
146
+ " <td>0.389</td>\n",
147
+ " <td>0.520</td>\n",
148
+ " <td>0.506</td>\n",
149
+ " <td>0.741</td>\n",
150
+ " <td>0.647</td>\n",
151
+ " <td>0.3395</td>\n",
152
+ " <td>0.3405</td>\n",
153
+ " <td>0.255001</td>\n",
154
+ " <td>0.268740</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>big-run-sampled-fineweb-c4-filters</td>\n",
159
+ " <td>6</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.401195</td>\n",
162
+ " <td>0.309</td>\n",
163
+ " <td>0.310</td>\n",
164
+ " <td>0.353</td>\n",
165
+ " <td>0.393</td>\n",
166
+ " <td>0.138</td>\n",
167
+ " <td>0.288</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.378</td>\n",
170
+ " <td>0.402</td>\n",
171
+ " <td>0.534</td>\n",
172
+ " <td>0.511</td>\n",
173
+ " <td>0.766</td>\n",
174
+ " <td>0.652</td>\n",
175
+ " <td>0.3395</td>\n",
176
+ " <td>0.3495</td>\n",
177
+ " <td>0.256203</td>\n",
178
+ " <td>0.269056</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>667</th>\n",
206
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>163000</td>\n",
209
+ " <td>0.466255</td>\n",
210
+ " <td>0.426</td>\n",
211
+ " <td>0.372</td>\n",
212
+ " <td>0.469</td>\n",
213
+ " <td>0.555</td>\n",
214
+ " <td>0.242</td>\n",
215
+ " <td>0.354</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.389</td>\n",
218
+ " <td>0.394</td>\n",
219
+ " <td>0.563</td>\n",
220
+ " <td>0.544</td>\n",
221
+ " <td>0.869</td>\n",
222
+ " <td>0.808</td>\n",
223
+ " <td>0.4460</td>\n",
224
+ " <td>0.4435</td>\n",
225
+ " <td>0.297125</td>\n",
226
+ " <td>0.317543</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>668</th>\n",
230
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>164000</td>\n",
233
+ " <td>0.469743</td>\n",
234
+ " <td>0.431</td>\n",
235
+ " <td>0.376</td>\n",
236
+ " <td>0.467</td>\n",
237
+ " <td>0.556</td>\n",
238
+ " <td>0.232</td>\n",
239
+ " <td>0.356</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.391</td>\n",
242
+ " <td>0.397</td>\n",
243
+ " <td>0.568</td>\n",
244
+ " <td>0.552</td>\n",
245
+ " <td>0.861</td>\n",
246
+ " <td>0.800</td>\n",
247
+ " <td>0.4450</td>\n",
248
+ " <td>0.4515</td>\n",
249
+ " <td>0.302706</td>\n",
250
+ " <td>0.318447</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>669</th>\n",
254
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>165000</td>\n",
257
+ " <td>0.469847</td>\n",
258
+ " <td>0.426</td>\n",
259
+ " <td>0.375</td>\n",
260
+ " <td>0.472</td>\n",
261
+ " <td>0.549</td>\n",
262
+ " <td>0.234</td>\n",
263
+ " <td>0.364</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.389</td>\n",
266
+ " <td>0.401</td>\n",
267
+ " <td>0.562</td>\n",
268
+ " <td>0.548</td>\n",
269
+ " <td>0.867</td>\n",
270
+ " <td>0.795</td>\n",
271
+ " <td>0.4435</td>\n",
272
+ " <td>0.4475</td>\n",
273
+ " <td>0.297586</td>\n",
274
+ " <td>0.319279</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>670</th>\n",
278
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>166000</td>\n",
281
+ " <td>0.467651</td>\n",
282
+ " <td>0.423</td>\n",
283
+ " <td>0.365</td>\n",
284
+ " <td>0.470</td>\n",
285
+ " <td>0.555</td>\n",
286
+ " <td>0.226</td>\n",
287
+ " <td>0.356</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.392</td>\n",
290
+ " <td>0.399</td>\n",
291
+ " <td>0.564</td>\n",
292
+ " <td>0.545</td>\n",
293
+ " <td>0.872</td>\n",
294
+ " <td>0.812</td>\n",
295
+ " <td>0.4365</td>\n",
296
+ " <td>0.4475</td>\n",
297
+ " <td>0.297256</td>\n",
298
+ " <td>0.319704</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>671</th>\n",
302
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>167000</td>\n",
305
+ " <td>0.469652</td>\n",
306
+ " <td>0.416</td>\n",
307
+ " <td>0.373</td>\n",
308
+ " <td>0.469</td>\n",
309
+ " <td>0.560</td>\n",
310
+ " <td>0.234</td>\n",
311
+ " <td>0.356</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.392</td>\n",
314
+ " <td>0.394</td>\n",
315
+ " <td>0.565</td>\n",
316
+ " <td>0.557</td>\n",
317
+ " <td>0.867</td>\n",
318
+ " <td>0.803</td>\n",
319
+ " <td>0.4430</td>\n",
320
+ " <td>0.4455</td>\n",
321
+ " <td>0.297409</td>\n",
322
+ " <td>0.317717</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>672 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 big-run-sampled-fineweb-c4-filters 6 0 0.330893 \n",
332
+ "1 big-run-sampled-fineweb-c4-filters 6 1000 0.359303 \n",
333
+ "2 big-run-sampled-fineweb-c4-filters 6 2000 0.375393 \n",
334
+ "3 big-run-sampled-fineweb-c4-filters 6 3000 0.389655 \n",
335
+ "4 big-run-sampled-fineweb-c4-filters 6 4000 0.401195 \n",
336
+ ".. ... ... ... ... \n",
337
+ "667 big-run-sampled_full_filtered_no_dedup 6 163000 0.466255 \n",
338
+ "668 big-run-sampled_full_filtered_no_dedup 6 164000 0.469743 \n",
339
+ "669 big-run-sampled_full_filtered_no_dedup 6 165000 0.469847 \n",
340
+ "670 big-run-sampled_full_filtered_no_dedup 6 166000 0.467651 \n",
341
+ "671 big-run-sampled_full_filtered_no_dedup 6 167000 0.469652 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.250 0.263 0.293 \n",
346
+ "2 0.268 0.277 0.319 \n",
347
+ "3 0.303 0.305 0.324 \n",
348
+ "4 0.309 0.310 0.353 \n",
349
+ ".. ... ... ... \n",
350
+ "667 0.426 0.372 0.469 \n",
351
+ "668 0.431 0.376 0.467 \n",
352
+ "669 0.426 0.375 0.472 \n",
353
+ "670 0.423 0.365 0.470 \n",
354
+ "671 0.416 0.373 0.469 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.285 0.140 0.276 ... 0.376 \n",
359
+ "2 0.324 0.150 0.274 ... 0.372 \n",
360
+ "3 0.358 0.152 0.280 ... 0.383 \n",
361
+ "4 0.393 0.138 0.288 ... 0.378 \n",
362
+ ".. ... ... ... ... ... \n",
363
+ "667 0.555 0.242 0.354 ... 0.389 \n",
364
+ "668 0.556 0.232 0.356 ... 0.391 \n",
365
+ "669 0.549 0.234 0.364 ... 0.389 \n",
366
+ "670 0.555 0.226 0.356 ... 0.392 \n",
367
+ "671 0.560 0.234 0.356 ... 0.392 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.208 \n",
371
+ "1 0.401 0.497 0.479 0.594 \n",
372
+ "2 0.411 0.507 0.484 0.688 \n",
373
+ "3 0.389 0.520 0.506 0.741 \n",
374
+ "4 0.402 0.534 0.511 0.766 \n",
375
+ ".. ... ... ... ... \n",
376
+ "667 0.394 0.563 0.544 0.869 \n",
377
+ "668 0.397 0.568 0.552 0.861 \n",
378
+ "669 0.401 0.562 0.548 0.867 \n",
379
+ "670 0.399 0.564 0.545 0.872 \n",
380
+ "671 0.394 0.565 0.557 0.867 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
384
+ "1 0.524 0.2740 0.2985 0.241617 0.251920 \n",
385
+ "2 0.606 0.3015 0.3270 0.246577 0.259146 \n",
386
+ "3 0.647 0.3395 0.3405 0.255001 0.268740 \n",
387
+ "4 0.652 0.3395 0.3495 0.256203 0.269056 \n",
388
+ ".. ... ... ... ... ... \n",
389
+ "667 0.808 0.4460 0.4435 0.297125 0.317543 \n",
390
+ "668 0.800 0.4450 0.4515 0.302706 0.318447 \n",
391
+ "669 0.795 0.4435 0.4475 0.297586 0.319279 \n",
392
+ "670 0.812 0.4365 0.4475 0.297256 0.319704 \n",
393
+ "671 0.803 0.4430 0.4455 0.297409 0.317717 \n",
394
+ "\n",
395
+ "[672 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 1,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/all-filters-big-runs.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 2,
414
+ "id": "839a06a71d9183e5",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-05-14T09:02:10.094329Z",
418
+ "start_time": "2024-05-14T09:02:10.081683Z"
419
+ }
420
+ },
421
+ "outputs": [
422
+ {
423
+ "data": {
424
+ "text/plain": [
425
+ "['big-run-sampled-fineweb-c4-filters',\n",
426
+ " 'big-run-sampled_full_ind_minhash',\n",
427
+ " 'big-run-fineweb-v1-all-dumps',\n",
428
+ " 'big-run-sampled_full_filtered_no_dedup']"
429
+ ]
430
+ },
431
+ "execution_count": 2,
432
+ "metadata": {},
433
+ "output_type": "execute_result"
434
+ }
435
+ ],
436
+ "source": [
437
+ "pd.unique(df[\"runname\"]).tolist()"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 3,
443
+ "id": "b610f43caefdf01",
444
+ "metadata": {
445
+ "ExecuteTime": {
446
+ "end_time": "2024-05-14T09:03:06.294766Z",
447
+ "start_time": "2024-05-14T09:03:06.291388Z"
448
+ },
449
+ "collapsed": false
450
+ },
451
+ "outputs": [],
452
+ "source": [
453
+ "runs_mapping = {\n",
454
+ " # \"big-run-refinedweb\": \"RefinedWeb\",\n",
455
+ " # \"big-run-c4\": \"C4\",\n",
456
+ " \"big-run-sampled_full_filtered_no_dedup\": \"FineWeb: base filtering only\",\n",
457
+ " \"big-run-sampled_full_ind_minhash\": \"FineWeb: independent MinHash (id mh)\",\n",
458
+ " \"big-run-sampled-fineweb-c4-filters\": \"FineWeb: id mh + C4 filters\",\n",
459
+ " \"big-run-fineweb-v1-all-dumps\": \"FineWeb: id mh + C4 + custom filters\",\n",
460
+ "}"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": 6,
466
+ "id": "initial_id",
467
+ "metadata": {
468
+ "ExecuteTime": {
469
+ "end_time": "2024-05-14T09:03:08.298110Z",
470
+ "start_time": "2024-05-14T09:03:08.024839Z"
471
+ },
472
+ "collapsed": true
473
+ },
474
+ "outputs": [],
475
+ "source": [
476
+ "from matplotlib import pyplot as plt\n",
477
+ "import os\n",
478
+ "import json\n",
479
+ "\n",
480
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
481
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
482
+ "\n",
483
+ "def normalize_runname(runname):\n",
484
+ " return runname.replace(\"/\", \"_\")\n",
485
+ "\n",
486
+ "grouped = (\n",
487
+ " df.groupby([\"runname\", \"steps\"])\n",
488
+ " .agg(\n",
489
+ " {\n",
490
+ " key: \"mean\" for key in metrics\n",
491
+ " }\n",
492
+ " )\n",
493
+ " .reset_index()\n",
494
+ ")\n",
495
+ "\n",
496
+ "file_id=\"../assets/data/plots/all_filtering_steps\"\n",
497
+ "files = {}\n",
498
+ "for metric in metrics:\n",
499
+ " datas = {}\n",
500
+ " for name, group in grouped.groupby(\"runname\"):\n",
501
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
502
+ " group = group.set_index(\"steps\")\n",
503
+ " rolling_avg = group\n",
504
+ " # rolling_avg = group.rolling(window=5).mean()\n",
505
+ " datas[name] = {\n",
506
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
507
+ " \"y\": rolling_avg[metric].tolist(),\n",
508
+ " \"label\": runs_mapping[name],\n",
509
+ " }\n",
510
+ " # Sort the datata based on the steps\n",
511
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
512
+ " # Create a folder\n",
513
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
514
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
515
+ " json.dump({\n",
516
+ " \"data\": datas,\n",
517
+ " \"layout\": {\n",
518
+ " \"title\": {\n",
519
+ " \"text\": \"The different FineWeb processing steps\"\n",
520
+ " },\n",
521
+ " }\n",
522
+ " }, f)\n",
523
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
524
+ "# Create l\n",
525
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
526
+ " json.dump({\n",
527
+ " \"files\": files,\n",
528
+ " \"settings\": {\n",
529
+ " \"defaultMetric\": \"agg_score\",\n",
530
+ " \"slider\":{\"min\":0,\"max\":30,\"default\":5}\n",
531
+ " }\n",
532
+ " }, f)\n",
533
+ " "
534
+ ]
535
+ },
536
+ {
537
+ "cell_type": "code",
538
+ "execution_count": 12,
539
+ "id": "af28ebbd054cdc33",
540
+ "metadata": {
541
+ "ExecuteTime": {
542
+ "end_time": "2024-05-14T08:14:41.132508Z",
543
+ "start_time": "2024-05-14T08:14:41.130025Z"
544
+ },
545
+ "collapsed": false
546
+ },
547
+ "outputs": [],
548
+ "source": []
549
+ },
550
+ {
551
+ "cell_type": "code",
552
+ "execution_count": null,
553
+ "id": "6b8c428e2fedeb1a",
554
+ "metadata": {},
555
+ "outputs": [],
556
+ "source": []
557
+ }
558
+ ],
559
+ "metadata": {
560
+ "kernelspec": {
561
+ "display_name": "Python 3",
562
+ "language": "python",
563
+ "name": "python3"
564
+ },
565
+ "language_info": {
566
+ "codemirror_mode": {
567
+ "name": "ipython",
568
+ "version": 3
569
+ },
570
+ "file_extension": ".py",
571
+ "mimetype": "text/x-python",
572
+ "name": "python",
573
+ "nbconvert_exporter": "python",
574
+ "pygments_lexer": "ipython3",
575
+ "version": "3.12.2"
576
+ }
577
+ },
578
+ "nbformat": 4,
579
+ "nbformat_minor": 5
580
+ }
notebooks/plot_c4_filters_hellaswag.ipynb ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-13T14:36:31.336129Z",
10
+ "start_time": "2024-05-13T14:36:31.323847Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
63
+ " <td>5</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330953</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.210</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2190</td>\n",
80
+ " <td>0.2515</td>\n",
81
+ " <td>0.230285</td>\n",
82
+ " <td>0.250127</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
87
+ " <td>5</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.357474</td>\n",
90
+ " <td>0.239</td>\n",
91
+ " <td>0.271</td>\n",
92
+ " <td>0.297</td>\n",
93
+ " <td>0.287</td>\n",
94
+ " <td>0.146</td>\n",
95
+ " <td>0.260</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.365</td>\n",
98
+ " <td>0.396</td>\n",
99
+ " <td>0.503</td>\n",
100
+ " <td>0.486</td>\n",
101
+ " <td>0.568</td>\n",
102
+ " <td>0.502</td>\n",
103
+ " <td>0.2665</td>\n",
104
+ " <td>0.2855</td>\n",
105
+ " <td>0.242526</td>\n",
106
+ " <td>0.253291</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
111
+ " <td>5</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.377436</td>\n",
114
+ " <td>0.280</td>\n",
115
+ " <td>0.284</td>\n",
116
+ " <td>0.321</td>\n",
117
+ " <td>0.332</td>\n",
118
+ " <td>0.134</td>\n",
119
+ " <td>0.268</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.368</td>\n",
122
+ " <td>0.399</td>\n",
123
+ " <td>0.519</td>\n",
124
+ " <td>0.502</td>\n",
125
+ " <td>0.686</td>\n",
126
+ " <td>0.590</td>\n",
127
+ " <td>0.3030</td>\n",
128
+ " <td>0.3215</td>\n",
129
+ " <td>0.245745</td>\n",
130
+ " <td>0.260988</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
135
+ " <td>5</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.387994</td>\n",
138
+ " <td>0.277</td>\n",
139
+ " <td>0.291</td>\n",
140
+ " <td>0.339</td>\n",
141
+ " <td>0.359</td>\n",
142
+ " <td>0.132</td>\n",
143
+ " <td>0.280</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.394</td>\n",
146
+ " <td>0.404</td>\n",
147
+ " <td>0.520</td>\n",
148
+ " <td>0.503</td>\n",
149
+ " <td>0.721</td>\n",
150
+ " <td>0.622</td>\n",
151
+ " <td>0.3210</td>\n",
152
+ " <td>0.3385</td>\n",
153
+ " <td>0.250427</td>\n",
154
+ " <td>0.264451</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
159
+ " <td>5</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.396110</td>\n",
162
+ " <td>0.299</td>\n",
163
+ " <td>0.315</td>\n",
164
+ " <td>0.340</td>\n",
165
+ " <td>0.366</td>\n",
166
+ " <td>0.158</td>\n",
167
+ " <td>0.286</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.376</td>\n",
170
+ " <td>0.399</td>\n",
171
+ " <td>0.515</td>\n",
172
+ " <td>0.500</td>\n",
173
+ " <td>0.739</td>\n",
174
+ " <td>0.620</td>\n",
175
+ " <td>0.3320</td>\n",
176
+ " <td>0.3445</td>\n",
177
+ " <td>0.256134</td>\n",
178
+ " <td>0.270382</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>250</th>\n",
206
+ " <td>sm-baseline-c4</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>10000</td>\n",
209
+ " <td>0.430443</td>\n",
210
+ " <td>0.335</td>\n",
211
+ " <td>0.326</td>\n",
212
+ " <td>0.379</td>\n",
213
+ " <td>0.474</td>\n",
214
+ " <td>0.176</td>\n",
215
+ " <td>0.340</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.385</td>\n",
218
+ " <td>0.406</td>\n",
219
+ " <td>0.525</td>\n",
220
+ " <td>0.523</td>\n",
221
+ " <td>0.767</td>\n",
222
+ " <td>0.675</td>\n",
223
+ " <td>0.3765</td>\n",
224
+ " <td>0.3750</td>\n",
225
+ " <td>0.269139</td>\n",
226
+ " <td>0.280545</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>251</th>\n",
230
+ " <td>sm-baseline-c4</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>11000</td>\n",
233
+ " <td>0.430776</td>\n",
234
+ " <td>0.341</td>\n",
235
+ " <td>0.323</td>\n",
236
+ " <td>0.391</td>\n",
237
+ " <td>0.481</td>\n",
238
+ " <td>0.192</td>\n",
239
+ " <td>0.346</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.390</td>\n",
242
+ " <td>0.405</td>\n",
243
+ " <td>0.531</td>\n",
244
+ " <td>0.515</td>\n",
245
+ " <td>0.766</td>\n",
246
+ " <td>0.676</td>\n",
247
+ " <td>0.3775</td>\n",
248
+ " <td>0.3770</td>\n",
249
+ " <td>0.266895</td>\n",
250
+ " <td>0.281210</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>252</th>\n",
254
+ " <td>sm-baseline-c4</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>12000</td>\n",
257
+ " <td>0.430352</td>\n",
258
+ " <td>0.340</td>\n",
259
+ " <td>0.319</td>\n",
260
+ " <td>0.392</td>\n",
261
+ " <td>0.475</td>\n",
262
+ " <td>0.192</td>\n",
263
+ " <td>0.342</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.377</td>\n",
266
+ " <td>0.395</td>\n",
267
+ " <td>0.528</td>\n",
268
+ " <td>0.518</td>\n",
269
+ " <td>0.785</td>\n",
270
+ " <td>0.688</td>\n",
271
+ " <td>0.3755</td>\n",
272
+ " <td>0.3840</td>\n",
273
+ " <td>0.267159</td>\n",
274
+ " <td>0.279819</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>253</th>\n",
278
+ " <td>sm-baseline-c4</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>13000</td>\n",
281
+ " <td>0.432136</td>\n",
282
+ " <td>0.339</td>\n",
283
+ " <td>0.326</td>\n",
284
+ " <td>0.395</td>\n",
285
+ " <td>0.477</td>\n",
286
+ " <td>0.198</td>\n",
287
+ " <td>0.348</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.390</td>\n",
290
+ " <td>0.405</td>\n",
291
+ " <td>0.529</td>\n",
292
+ " <td>0.518</td>\n",
293
+ " <td>0.785</td>\n",
294
+ " <td>0.682</td>\n",
295
+ " <td>0.3780</td>\n",
296
+ " <td>0.3825</td>\n",
297
+ " <td>0.269719</td>\n",
298
+ " <td>0.281585</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>254</th>\n",
302
+ " <td>sm-baseline-c4</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>13500</td>\n",
305
+ " <td>0.433866</td>\n",
306
+ " <td>0.344</td>\n",
307
+ " <td>0.328</td>\n",
308
+ " <td>0.394</td>\n",
309
+ " <td>0.484</td>\n",
310
+ " <td>0.198</td>\n",
311
+ " <td>0.334</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.388</td>\n",
314
+ " <td>0.406</td>\n",
315
+ " <td>0.531</td>\n",
316
+ " <td>0.523</td>\n",
317
+ " <td>0.778</td>\n",
318
+ " <td>0.682</td>\n",
319
+ " <td>0.3795</td>\n",
320
+ " <td>0.3845</td>\n",
321
+ " <td>0.269601</td>\n",
322
+ " <td>0.284425</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>255 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 filtering-baseline-2019-18-40gt 5 0 0.330953 \n",
332
+ "1 filtering-baseline-2019-18-40gt 5 1000 0.357474 \n",
333
+ "2 filtering-baseline-2019-18-40gt 5 2000 0.377436 \n",
334
+ "3 filtering-baseline-2019-18-40gt 5 3000 0.387994 \n",
335
+ "4 filtering-baseline-2019-18-40gt 5 4000 0.396110 \n",
336
+ ".. ... ... ... ... \n",
337
+ "250 sm-baseline-c4 6 10000 0.430443 \n",
338
+ "251 sm-baseline-c4 6 11000 0.430776 \n",
339
+ "252 sm-baseline-c4 6 12000 0.430352 \n",
340
+ "253 sm-baseline-c4 6 13000 0.432136 \n",
341
+ "254 sm-baseline-c4 6 13500 0.433866 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.239 0.271 0.297 \n",
346
+ "2 0.280 0.284 0.321 \n",
347
+ "3 0.277 0.291 0.339 \n",
348
+ "4 0.299 0.315 0.340 \n",
349
+ ".. ... ... ... \n",
350
+ "250 0.335 0.326 0.379 \n",
351
+ "251 0.341 0.323 0.391 \n",
352
+ "252 0.340 0.319 0.392 \n",
353
+ "253 0.339 0.326 0.395 \n",
354
+ "254 0.344 0.328 0.394 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.287 0.146 0.260 ... 0.365 \n",
359
+ "2 0.332 0.134 0.268 ... 0.368 \n",
360
+ "3 0.359 0.132 0.280 ... 0.394 \n",
361
+ "4 0.366 0.158 0.286 ... 0.376 \n",
362
+ ".. ... ... ... ... ... \n",
363
+ "250 0.474 0.176 0.340 ... 0.385 \n",
364
+ "251 0.481 0.192 0.346 ... 0.390 \n",
365
+ "252 0.475 0.192 0.342 ... 0.377 \n",
366
+ "253 0.477 0.198 0.348 ... 0.390 \n",
367
+ "254 0.484 0.198 0.334 ... 0.388 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.210 \n",
371
+ "1 0.396 0.503 0.486 0.568 \n",
372
+ "2 0.399 0.519 0.502 0.686 \n",
373
+ "3 0.404 0.520 0.503 0.721 \n",
374
+ "4 0.399 0.515 0.500 0.739 \n",
375
+ ".. ... ... ... ... \n",
376
+ "250 0.406 0.525 0.523 0.767 \n",
377
+ "251 0.405 0.531 0.515 0.766 \n",
378
+ "252 0.395 0.528 0.518 0.785 \n",
379
+ "253 0.405 0.529 0.518 0.785 \n",
380
+ "254 0.406 0.531 0.523 0.778 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2190 0.2515 0.230285 0.250127 \n",
384
+ "1 0.502 0.2665 0.2855 0.242526 0.253291 \n",
385
+ "2 0.590 0.3030 0.3215 0.245745 0.260988 \n",
386
+ "3 0.622 0.3210 0.3385 0.250427 0.264451 \n",
387
+ "4 0.620 0.3320 0.3445 0.256134 0.270382 \n",
388
+ ".. ... ... ... ... ... \n",
389
+ "250 0.675 0.3765 0.3750 0.269139 0.280545 \n",
390
+ "251 0.676 0.3775 0.3770 0.266895 0.281210 \n",
391
+ "252 0.688 0.3755 0.3840 0.267159 0.279819 \n",
392
+ "253 0.682 0.3780 0.3825 0.269719 0.281585 \n",
393
+ "254 0.682 0.3795 0.3845 0.269601 0.284425 \n",
394
+ "\n",
395
+ "[255 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 2,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/c4-filters.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 3,
414
+ "id": "839a06a71d9183e5",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-05-13T14:36:32.338012Z",
418
+ "start_time": "2024-05-13T14:36:32.335209Z"
419
+ }
420
+ },
421
+ "outputs": [
422
+ {
423
+ "data": {
424
+ "text/plain": [
425
+ "['filtering-baseline-2019-18-40gt',\n",
426
+ " 'filtering-baseline-2019-18-60gt',\n",
427
+ " 'filtering-c4-all-except-terminal_punct',\n",
428
+ " 'filtering-c4-all',\n",
429
+ " 'filtering-c4-curly_bracket',\n",
430
+ " 'filtering-c4-terminal_punct',\n",
431
+ " 'filtering-c4-word_lengths',\n",
432
+ " 'sm-baseline-c4']"
433
+ ]
434
+ },
435
+ "execution_count": 3,
436
+ "metadata": {},
437
+ "output_type": "execute_result"
438
+ }
439
+ ],
440
+ "source": [
441
+ "pd.unique(df[\"runname\"]).tolist()"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 4,
447
+ "id": "b610f43caefdf01",
448
+ "metadata": {
449
+ "ExecuteTime": {
450
+ "end_time": "2024-05-13T16:06:36.968532Z",
451
+ "start_time": "2024-05-13T16:06:36.966172Z"
452
+ },
453
+ "collapsed": false
454
+ },
455
+ "outputs": [],
456
+ "source": [
457
+ "runs_mapping = {\n",
458
+ " # 'filtering-baseline-2019-18-40gt': \"baseline\",\n",
459
+ " 'filtering-baseline-2019-18-60gt': \"baseline\",\n",
460
+ " 'filtering-c4-curly_bracket': \"curly_bracket filter\",\n",
461
+ " 'filtering-c4-terminal_punct': \"terminal_punct filter\",\n",
462
+ " 'filtering-c4-word_lengths': \"word_lengths filter\",\n",
463
+ " 'filtering-c4-all': \"All filters\",\n",
464
+ " 'filtering-c4-all-except-terminal_punct': \"All filters except terminal_punct\",\n",
465
+ " 'sm-baseline-c4': \"C4\"\n",
466
+ "}"
467
+ ]
468
+ },
469
+ {
470
+ "cell_type": "code",
471
+ "execution_count": 6,
472
+ "id": "initial_id",
473
+ "metadata": {
474
+ "ExecuteTime": {
475
+ "end_time": "2024-05-13T16:06:37.459935Z",
476
+ "start_time": "2024-05-13T16:06:37.181024Z"
477
+ },
478
+ "collapsed": true
479
+ },
480
+ "outputs": [],
481
+ "source": [
482
+ "from matplotlib import pyplot as plt\n",
483
+ "\n",
484
+ "\n",
485
+ "import json\n",
486
+ "import os\n",
487
+ "from matplotlib import pyplot as plt\n",
488
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
489
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
490
+ "\n",
491
+ "def normalize_runname(runname):\n",
492
+ " return runname.replace(\"/\", \"_\")\n",
493
+ "\n",
494
+ "grouped = (\n",
495
+ " df.groupby([\"runname\", \"steps\"])\n",
496
+ " .agg(\n",
497
+ " {\n",
498
+ " key: \"mean\" for key in metrics\n",
499
+ " }\n",
500
+ " )\n",
501
+ " .reset_index()\n",
502
+ ")\n",
503
+ "\n",
504
+ "file_id=\"../assets/data/plots/c4_filters_hellaswag\"\n",
505
+ "files = {}\n",
506
+ "for metric in metrics:\n",
507
+ " datas = {}\n",
508
+ " for name, group in grouped.groupby(\"runname\"):\n",
509
+ " if name not in runs_mapping:\n",
510
+ " continue\n",
511
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
512
+ " group = group.set_index(\"steps\")\n",
513
+ " rolling_avg = group\n",
514
+ " datas[name] = {\n",
515
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
516
+ " \"y\": rolling_avg[metric].tolist(),\n",
517
+ " \"label\": runs_mapping[name],\n",
518
+ " }\n",
519
+ " # Sort the runs by their final metric value, highest first\n",
520
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
521
+ " # Create a folder\n",
522
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
523
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
524
+ " json.dump({\n",
525
+ " \"data\": datas,\n",
526
+ " \"layout\": {\n",
527
+ " \"title\": {\n",
528
+ " \"text\": \"C4 filtering effect on HellaSwag\"\n",
529
+ " },\n",
530
+ " }\n",
531
+ " }, f)\n",
532
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
533
+ "# Create index\n",
534
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
535
+ " json.dump({\n",
536
+ " \"files\": files,\n",
537
+ " \"settings\": {\n",
538
+ " \"defaultMetric\": \"hellaswag/acc_norm\",\n",
539
+ " \"slider\":{\"min\":0,\"max\":10,\"default\":3}\n",
540
+ " }\n",
541
+ " }, f)"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "execution_count": 3,
547
+ "id": "af28ebbd054cdc33",
548
+ "metadata": {
549
+ "ExecuteTime": {
550
+ "end_time": "2024-04-30T12:52:05.836260Z",
551
+ "start_time": "2024-04-30T12:52:05.834381Z"
552
+ },
553
+ "collapsed": false
554
+ },
555
+ "outputs": [],
556
+ "source": []
557
+ }
558
+ ],
559
+ "metadata": {
560
+ "kernelspec": {
561
+ "display_name": "Python 3",
562
+ "language": "python",
563
+ "name": "python3"
564
+ },
565
+ "language_info": {
566
+ "codemirror_mode": {
567
+ "name": "ipython",
568
+ "version": 3
569
+ },
570
+ "file_extension": ".py",
571
+ "mimetype": "text/x-python",
572
+ "name": "python",
573
+ "nbconvert_exporter": "python",
574
+ "pygments_lexer": "ipython3",
575
+ "version": "3.12.2"
576
+ }
577
+ },
578
+ "nbformat": 4,
579
+ "nbformat_minor": 5
580
+ }
notebooks/plot_commoncrawl_dumps.ipynb ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "initial_id",
6
+ "metadata": {
7
+ "collapsed": true,
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-14T09:57:03.097798Z",
10
+ "start_time": "2024-05-14T09:57:02.853658Z"
11
+ }
12
+ },
13
+ "source": [
14
+ "import pandas as pd"
15
+ ],
16
+ "execution_count": 2,
17
+ "outputs": []
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "source": [
22
+ "df = pd.read_csv(\"/home/gui/hf_dev/datatrove/blogpost/data/commoncrawl_dumps.csv\")"
23
+ ],
24
+ "metadata": {
25
+ "collapsed": false,
26
+ "ExecuteTime": {
27
+ "end_time": "2024-05-14T09:57:03.110303Z",
28
+ "start_time": "2024-05-14T09:57:03.098988Z"
29
+ }
30
+ },
31
+ "id": "157e18836c20793c",
32
+ "execution_count": 3,
33
+ "outputs": []
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "source": [
38
+ "grouped = df.groupby('runname')\n",
39
+ "\n",
40
+ "# Define a function to take the top 6 rows of each group\n",
41
+ "def top_6_avg(group):\n",
42
+ " # Sort the group by \"steps\" in descending order\n",
43
+ " sorted_group = group.sort_values(by='steps', ascending=False)\n",
44
+ " # Take the top 6 rows\n",
45
+ " top_6 = sorted_group.head(6)\n",
46
+ " # Calculate the average of \"agg_score\"\n",
47
+ " avg_score = top_6['agg_score'].mean()\n",
48
+ " return avg_score\n",
49
+ "\n",
50
+ "def top_6_stats(group):\n",
51
+ " # Sort the group by \"steps\" in descending order\n",
52
+ " sorted_group = group.sort_values(by='steps', ascending=False)\n",
53
+ " # Take the top 6 rows\n",
54
+ " top_6 = sorted_group.head(6)\n",
55
+ " # Calculate the average of \"agg_score\"\n",
56
+ " avg_score = top_6['agg_score'].mean()\n",
57
+ " # Calculate the standard deviation of \"agg_score\"\n",
58
+ " std_dev = top_6['agg_score'].std()\n",
59
+ " return pd.Series({'avg': avg_score, 'std_dev': std_dev})\n",
60
+ "\n",
61
+ "# Apply the function to each group and aggregate the results\n",
62
+ "result = grouped.apply(top_6_stats)"
63
+ ],
64
+ "metadata": {
65
+ "collapsed": false,
66
+ "ExecuteTime": {
67
+ "end_time": "2024-05-14T09:57:03.227764Z",
68
+ "start_time": "2024-05-14T09:57:03.183929Z"
69
+ }
70
+ },
71
+ "id": "af7c0416a6371f9a",
72
+ "execution_count": 4,
73
+ "outputs": []
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "source": [
78
+ "result"
79
+ ],
80
+ "metadata": {
81
+ "collapsed": false,
82
+ "ExecuteTime": {
83
+ "end_time": "2024-05-14T09:57:03.784515Z",
84
+ "start_time": "2024-05-14T09:57:03.775829Z"
85
+ }
86
+ },
87
+ "id": "65c0cd58c6f9f9d6",
88
+ "execution_count": 5,
89
+ "outputs": []
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "source": [
94
+ "import numpy as np\n",
95
+ "import matplotlib\n",
96
+ "import matplotlib.pyplot as plt\n",
97
+ "import matplotlib.colors as mcolors\n",
98
+ "\n",
99
+ "# Assuming you have already computed the result DataFrame\n",
100
+ "\n",
101
+ "# Sort the result DataFrame by \"runname\"\n",
102
+ "result_sorted = result.sort_index()\n",
103
+ "colors = result_sorted.index.str.split('-').str[0].astype(int)\n",
104
+ "\n",
105
+ "cmap = plt.cm.tab10\n",
106
+ "\n",
107
+ "# Build a new colormap from the tab10 colors, with the last two colors prepended\n",
108
+ "new_colors = cmap(np.linspace(0, 1, cmap.N))\n",
109
+ "new_colors = np.concatenate((new_colors[-2:], new_colors))\n",
110
+ "new_cmap = mcolors.ListedColormap(new_colors)\n",
111
+ "rgba_colors = new_cmap(new_colors)\n",
112
+ "\n",
113
+ "\n",
114
+ "# Plotting\n",
115
+ "plt.figure(figsize=(15, 10))\n",
116
+ "# Join the points with a line\n",
117
+ "plt.plot(range(len(result_sorted)), result_sorted[\"avg\"], linestyle='-', color='gray', alpha=0.5, zorder=1)\n",
118
+ "scatter = plt.scatter(range(len(result_sorted)), result_sorted[\"avg\"], c=colors, cmap=new_cmap, marker='o', s=100, zorder=2)\n",
119
+ "\n",
120
+ "norm = plt.Normalize(min(colors), max(colors))\n",
121
+ "\n",
122
+ "import matplotlib.cm as cm\n",
123
+ "# Creating a ScalarMappable object with the custom colormap (new_cmap) and normalization\n",
124
+ "sm = cm.ScalarMappable(cmap=new_cmap, norm=norm)\n",
125
+ "\n",
126
+ "plt.xlabel('Year', fontsize=18)\n",
127
+ "plt.ylabel('Average Agg Score', fontsize=18)\n",
128
+ "plt.title('Score by dump', fontsize=24)\n",
129
+ "plt.xticks(range(len(result_sorted)), colors, ha='center', fontsize=14)\n",
130
+ "plt.yticks(fontsize=14)\n",
131
+ "ax = plt.gca()\n",
132
+ "\n",
133
+ "# for i in range(len(result_sorted)):\n",
134
+ "# plt.errorbar(i, result_sorted.iloc[i]['avg'], yerr=result_sorted.iloc[i]['std_dev'], fmt='o', color=sm.to_rgba(colors[i]), markersize=0, capsize=5)\n",
135
+ "prev = None\n",
136
+ "labels = ax.xaxis.get_ticklabels()\n",
137
+ "# labels[0].set_horizontalalignment('right')\n",
138
+ "lines = []\n",
139
+ "for x, name in enumerate(colors.tolist()):\n",
140
+ " if name != prev:\n",
141
+ " plt.axvline(x=x, color='grey', linestyle=':')\n",
142
+ " lines.append(x)\n",
143
+ " prev = name\n",
144
+ "\n",
145
+ "mids = np.floor((np.array(lines[:-1]) + np.array(lines[1:])) / 2)\n",
146
+ "for x in range(len(colors) - 1):\n",
147
+ " if x not in mids:\n",
148
+ " labels[x].set_visible(False)\n",
149
+ "labels[-1].set_horizontalalignment('left')\n",
150
+ " \n",
151
+ "\n",
152
+ "# plt.grid(True)\n",
153
+ "plt.savefig(\"/home/gui/hf_dev/datatrove/blogpost/plots/score_by_dump.png\", bbox_inches='tight', dpi=300)\n",
154
+ "plt.show()"
155
+ ],
156
+ "metadata": {
157
+ "collapsed": false,
158
+ "ExecuteTime": {
159
+ "end_time": "2024-05-14T12:33:41.469562Z",
160
+ "start_time": "2024-05-14T12:33:40.411105Z"
161
+ }
162
+ },
163
+ "id": "412ed6b4570d10e9",
164
+ "execution_count": 98,
165
+ "outputs": []
166
+ },
167
+ {
168
+ "metadata": {
169
+ "ExecuteTime": {
170
+ "end_time": "2024-05-14T12:18:06.365519Z",
171
+ "start_time": "2024-05-14T12:18:06.360995Z"
172
+ }
173
+ },
174
+ "cell_type": "code",
175
+ "source": [
176
+ " \n",
177
+ "new_colors = cmap(np.linspace(0, 1, cmap.N))\n",
178
+ "new_colors = np.concatenate((new_colors[-2:], new_colors))\n",
179
+ "mcolors.ListedColormap(new_colors)"
180
+ ],
181
+ "id": "270bd97983706aee",
182
+ "execution_count": 85,
183
+ "outputs": []
184
+ },
185
+ {
186
+ "metadata": {
187
+ "ExecuteTime": {
188
+ "end_time": "2024-05-14T12:13:03.523524Z",
189
+ "start_time": "2024-05-14T12:13:03.518910Z"
190
+ }
191
+ },
192
+ "cell_type": "code",
193
+ "source": "new_cmap",
194
+ "id": "ae52ddd47cf306a1",
195
+ "execution_count": 76,
196
+ "outputs": []
197
+ },
198
+ {
199
+ "metadata": {},
200
+ "cell_type": "markdown",
201
+ "source": "Flipped axis",
202
+ "id": "dd4bbdf230df5953"
203
+ },
204
+ {
205
+ "metadata": {
206
+ "ExecuteTime": {
207
+ "end_time": "2024-05-14T10:16:00.731056Z",
208
+ "start_time": "2024-05-14T10:15:59.648467Z"
209
+ }
210
+ },
211
+ "cell_type": "code",
212
+ "source": [
213
+ "import matplotlib.pyplot as plt\n",
214
+ "\n",
215
+ "# Assuming you have already computed the result DataFrame\n",
216
+ "\n",
217
+ "# Sort the result DataFrame by \"runname\"\n",
218
+ "result_sorted = result.sort_index()\n",
219
+ "colors = result_sorted.index.str.split('-').str[0].astype(int)\n",
220
+ "\n",
221
+ "rgba_colors = plt.cm.tab20(colors)\n",
222
+ "# Plotting\n",
223
+ "plt.figure(figsize=(10, 20))\n",
224
+ "scatter = plt.scatter(result_sorted[\"avg\"], range(len(result_sorted)), c=colors, cmap='tab20', marker='o', s=100)\n",
225
+ "# Join the points with a line\n",
226
+ "plt.plot(result_sorted[\"avg\"], range(len(result_sorted)), linestyle='-', color='gray', alpha=0.5)\n",
227
+ "\n",
228
+ "norm = plt.Normalize(min(colors), max(colors))\n",
229
+ "\n",
230
+ "import matplotlib.cm as cm\n",
231
+ "\n",
232
+ "# Creating a ScalarMappable object with the tab20 colormap and normalization\n",
233
+ "sm = cm.ScalarMappable(cmap='tab20', norm=norm)\n",
234
+ "\n",
235
+ "plt.xlabel('Dump')\n",
236
+ "plt.ylabel('Average Agg Score')\n",
237
+ "plt.title('Score by dump. 3 last checkpoints of each seed avgd')\n",
238
+ "plt.yticks(range(len(result_sorted)), result_sorted.index, ha='right', rotation_mode='anchor')\n",
239
+ "ax = plt.gca()\n",
240
+ "\n",
241
+ "# for i in range(len(result_sorted)):\n",
242
+ "# plt.errorbar(i, result_sorted.iloc[i]['avg'], yerr=result_sorted.iloc[i]['std_dev'], fmt='o', color=sm.to_rgba(colors[i]), markersize=0, capsize=5)\n",
243
+ "# for label in ax.xaxis.get_ticklabels()[1::2]:\n",
244
+ "# label.set_visible(False)\n",
245
+ "\n",
246
+ "plt.grid(True)\n",
247
+ "plt.savefig(\"/home/gui/hf_dev/datatrove/blogpost/plots/score_by_dump.png\", bbox_inches='tight', dpi=300)\n",
248
+ "plt.show()\n"
249
+ ],
250
+ "id": "49656c68704a55ca",
251
+ "execution_count": 36,
252
+ "outputs": []
253
+ },
254
+ {
255
+ "metadata": {},
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "source": "",
259
+ "id": "1872a68fa04b776d",
260
+ "outputs": []
261
+ }
262
+ ],
263
+ "metadata": {
264
+ "kernelspec": {
265
+ "display_name": "Python 3",
266
+ "language": "python",
267
+ "name": "python3"
268
+ },
269
+ "language_info": {
270
+ "codemirror_mode": {
271
+ "name": "ipython",
272
+ "version": 2
273
+ },
274
+ "file_extension": ".py",
275
+ "mimetype": "text/x-python",
276
+ "name": "python",
277
+ "nbconvert_exporter": "python",
278
+ "pygments_lexer": "ipython2",
279
+ "version": "2.7.6"
280
+ }
281
+ },
282
+ "nbformat": 4,
283
+ "nbformat_minor": 5
284
+ }
notebooks/plot_commoncrawl_dumps_fixed.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/plot_custom_filters.ipynb ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 6,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-14T09:06:04.487186Z",
10
+ "start_time": "2024-05-14T09:06:04.255111Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
63
+ " <td>5</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330953</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.210</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2190</td>\n",
80
+ " <td>0.2515</td>\n",
81
+ " <td>0.230285</td>\n",
82
+ " <td>0.250127</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
87
+ " <td>5</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.357474</td>\n",
90
+ " <td>0.239</td>\n",
91
+ " <td>0.271</td>\n",
92
+ " <td>0.297</td>\n",
93
+ " <td>0.287</td>\n",
94
+ " <td>0.146</td>\n",
95
+ " <td>0.260</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.365</td>\n",
98
+ " <td>0.396</td>\n",
99
+ " <td>0.503</td>\n",
100
+ " <td>0.486</td>\n",
101
+ " <td>0.568</td>\n",
102
+ " <td>0.502</td>\n",
103
+ " <td>0.2665</td>\n",
104
+ " <td>0.2855</td>\n",
105
+ " <td>0.242526</td>\n",
106
+ " <td>0.253291</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
111
+ " <td>5</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.377436</td>\n",
114
+ " <td>0.280</td>\n",
115
+ " <td>0.284</td>\n",
116
+ " <td>0.321</td>\n",
117
+ " <td>0.332</td>\n",
118
+ " <td>0.134</td>\n",
119
+ " <td>0.268</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.368</td>\n",
122
+ " <td>0.399</td>\n",
123
+ " <td>0.519</td>\n",
124
+ " <td>0.502</td>\n",
125
+ " <td>0.686</td>\n",
126
+ " <td>0.590</td>\n",
127
+ " <td>0.3030</td>\n",
128
+ " <td>0.3215</td>\n",
129
+ " <td>0.245745</td>\n",
130
+ " <td>0.260988</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
135
+ " <td>5</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.387994</td>\n",
138
+ " <td>0.277</td>\n",
139
+ " <td>0.291</td>\n",
140
+ " <td>0.339</td>\n",
141
+ " <td>0.359</td>\n",
142
+ " <td>0.132</td>\n",
143
+ " <td>0.280</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.394</td>\n",
146
+ " <td>0.404</td>\n",
147
+ " <td>0.520</td>\n",
148
+ " <td>0.503</td>\n",
149
+ " <td>0.721</td>\n",
150
+ " <td>0.622</td>\n",
151
+ " <td>0.3210</td>\n",
152
+ " <td>0.3385</td>\n",
153
+ " <td>0.250427</td>\n",
154
+ " <td>0.264451</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
159
+ " <td>5</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.396110</td>\n",
162
+ " <td>0.299</td>\n",
163
+ " <td>0.315</td>\n",
164
+ " <td>0.340</td>\n",
165
+ " <td>0.366</td>\n",
166
+ " <td>0.158</td>\n",
167
+ " <td>0.286</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.376</td>\n",
170
+ " <td>0.399</td>\n",
171
+ " <td>0.515</td>\n",
172
+ " <td>0.500</td>\n",
173
+ " <td>0.739</td>\n",
174
+ " <td>0.620</td>\n",
175
+ " <td>0.3320</td>\n",
176
+ " <td>0.3445</td>\n",
177
+ " <td>0.256134</td>\n",
178
+ " <td>0.270382</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>129</th>\n",
206
+ " <td>filtering-custom-short-line-ratio-0.67</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>10000</td>\n",
209
+ " <td>0.422300</td>\n",
210
+ " <td>0.333</td>\n",
211
+ " <td>0.341</td>\n",
212
+ " <td>0.382</td>\n",
213
+ " <td>0.417</td>\n",
214
+ " <td>0.192</td>\n",
215
+ " <td>0.318</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.389</td>\n",
218
+ " <td>0.407</td>\n",
219
+ " <td>0.536</td>\n",
220
+ " <td>0.530</td>\n",
221
+ " <td>NaN</td>\n",
222
+ " <td>NaN</td>\n",
223
+ " <td>0.3630</td>\n",
224
+ " <td>0.3700</td>\n",
225
+ " <td>0.266752</td>\n",
226
+ " <td>0.284400</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>130</th>\n",
230
+ " <td>filtering-custom-short-line-ratio-0.67</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>11000</td>\n",
233
+ " <td>0.425840</td>\n",
234
+ " <td>0.345</td>\n",
235
+ " <td>0.340</td>\n",
236
+ " <td>0.395</td>\n",
237
+ " <td>0.432</td>\n",
238
+ " <td>0.192</td>\n",
239
+ " <td>0.322</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.379</td>\n",
242
+ " <td>0.405</td>\n",
243
+ " <td>0.527</td>\n",
244
+ " <td>0.531</td>\n",
245
+ " <td>NaN</td>\n",
246
+ " <td>NaN</td>\n",
247
+ " <td>0.3680</td>\n",
248
+ " <td>0.3745</td>\n",
249
+ " <td>0.267998</td>\n",
250
+ " <td>0.282222</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>131</th>\n",
254
+ " <td>filtering-custom-short-line-ratio-0.67</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>12000</td>\n",
257
+ " <td>0.427343</td>\n",
258
+ " <td>0.339</td>\n",
259
+ " <td>0.348</td>\n",
260
+ " <td>0.397</td>\n",
261
+ " <td>0.439</td>\n",
262
+ " <td>0.198</td>\n",
263
+ " <td>0.316</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.382</td>\n",
266
+ " <td>0.402</td>\n",
267
+ " <td>0.535</td>\n",
268
+ " <td>0.536</td>\n",
269
+ " <td>NaN</td>\n",
270
+ " <td>NaN</td>\n",
271
+ " <td>0.3705</td>\n",
272
+ " <td>0.3795</td>\n",
273
+ " <td>0.268891</td>\n",
274
+ " <td>0.283246</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>132</th>\n",
278
+ " <td>filtering-custom-short-line-ratio-0.67</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>13000</td>\n",
281
+ " <td>0.429031</td>\n",
282
+ " <td>0.338</td>\n",
283
+ " <td>0.338</td>\n",
284
+ " <td>0.398</td>\n",
285
+ " <td>0.449</td>\n",
286
+ " <td>0.194</td>\n",
287
+ " <td>0.326</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.384</td>\n",
290
+ " <td>0.406</td>\n",
291
+ " <td>0.539</td>\n",
292
+ " <td>0.534</td>\n",
293
+ " <td>NaN</td>\n",
294
+ " <td>NaN</td>\n",
295
+ " <td>0.3655</td>\n",
296
+ " <td>0.3775</td>\n",
297
+ " <td>0.271709</td>\n",
298
+ " <td>0.282748</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>133</th>\n",
302
+ " <td>filtering-custom-short-line-ratio-0.67</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>13500</td>\n",
305
+ " <td>0.428488</td>\n",
306
+ " <td>0.346</td>\n",
307
+ " <td>0.340</td>\n",
308
+ " <td>0.398</td>\n",
309
+ " <td>0.447</td>\n",
310
+ " <td>0.188</td>\n",
311
+ " <td>0.332</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.382</td>\n",
314
+ " <td>0.404</td>\n",
315
+ " <td>0.527</td>\n",
316
+ " <td>0.527</td>\n",
317
+ " <td>NaN</td>\n",
318
+ " <td>NaN</td>\n",
319
+ " <td>0.3720</td>\n",
320
+ " <td>0.3730</td>\n",
321
+ " <td>0.272315</td>\n",
322
+ " <td>0.283901</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>134 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 filtering-baseline-2019-18-40gt 5 0 0.330953 \n",
332
+ "1 filtering-baseline-2019-18-40gt 5 1000 0.357474 \n",
333
+ "2 filtering-baseline-2019-18-40gt 5 2000 0.377436 \n",
334
+ "3 filtering-baseline-2019-18-40gt 5 3000 0.387994 \n",
335
+ "4 filtering-baseline-2019-18-40gt 5 4000 0.396110 \n",
336
+ ".. ... ... ... ... \n",
337
+ "129 filtering-custom-short-line-ratio-0.67 6 10000 0.422300 \n",
338
+ "130 filtering-custom-short-line-ratio-0.67 6 11000 0.425840 \n",
339
+ "131 filtering-custom-short-line-ratio-0.67 6 12000 0.427343 \n",
340
+ "132 filtering-custom-short-line-ratio-0.67 6 13000 0.429031 \n",
341
+ "133 filtering-custom-short-line-ratio-0.67 6 13500 0.428488 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.239 0.271 0.297 \n",
346
+ "2 0.280 0.284 0.321 \n",
347
+ "3 0.277 0.291 0.339 \n",
348
+ "4 0.299 0.315 0.340 \n",
349
+ ".. ... ... ... \n",
350
+ "129 0.333 0.341 0.382 \n",
351
+ "130 0.345 0.340 0.395 \n",
352
+ "131 0.339 0.348 0.397 \n",
353
+ "132 0.338 0.338 0.398 \n",
354
+ "133 0.346 0.340 0.398 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.287 0.146 0.260 ... 0.365 \n",
359
+ "2 0.332 0.134 0.268 ... 0.368 \n",
360
+ "3 0.359 0.132 0.280 ... 0.394 \n",
361
+ "4 0.366 0.158 0.286 ... 0.376 \n",
362
+ ".. ... ... ... ... ... \n",
363
+ "129 0.417 0.192 0.318 ... 0.389 \n",
364
+ "130 0.432 0.192 0.322 ... 0.379 \n",
365
+ "131 0.439 0.198 0.316 ... 0.382 \n",
366
+ "132 0.449 0.194 0.326 ... 0.384 \n",
367
+ "133 0.447 0.188 0.332 ... 0.382 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.210 \n",
371
+ "1 0.396 0.503 0.486 0.568 \n",
372
+ "2 0.399 0.519 0.502 0.686 \n",
373
+ "3 0.404 0.520 0.503 0.721 \n",
374
+ "4 0.399 0.515 0.500 0.739 \n",
375
+ ".. ... ... ... ... \n",
376
+ "129 0.407 0.536 0.530 NaN \n",
377
+ "130 0.405 0.527 0.531 NaN \n",
378
+ "131 0.402 0.535 0.536 NaN \n",
379
+ "132 0.406 0.539 0.534 NaN \n",
380
+ "133 0.404 0.527 0.527 NaN \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2190 0.2515 0.230285 0.250127 \n",
384
+ "1 0.502 0.2665 0.2855 0.242526 0.253291 \n",
385
+ "2 0.590 0.3030 0.3215 0.245745 0.260988 \n",
386
+ "3 0.622 0.3210 0.3385 0.250427 0.264451 \n",
387
+ "4 0.620 0.3320 0.3445 0.256134 0.270382 \n",
388
+ ".. ... ... ... ... ... \n",
389
+ "129 NaN 0.3630 0.3700 0.266752 0.284400 \n",
390
+ "130 NaN 0.3680 0.3745 0.267998 0.282222 \n",
391
+ "131 NaN 0.3705 0.3795 0.268891 0.283246 \n",
392
+ "132 NaN 0.3655 0.3775 0.271709 0.282748 \n",
393
+ "133 NaN 0.3720 0.3730 0.272315 0.283901 \n",
394
+ "\n",
395
+ "[134 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 6,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/custom_filters.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 7,
414
+ "id": "28e61084",
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "runs_mapping = {\n",
419
+ " \"filtering-baseline-2019-18-40gt\": \"Baseline\",\n",
420
+ " \"filtering-custom-line-char-duplicated-v2-0.01\": \"Line duplicates filter\",\n",
421
+ " \"filtering-custom-lines-punc-0.12\": \"Punctuation filter\",\n",
422
+ " \"filtering-custom-short-line-ratio-0.67\": \"Short lines filter\",\n",
423
+ " \"filtering-custom-punc0.12-short-lines0.67-line_char_dup0.1\": \"Filters combined\",\n",
424
+ "}\n",
425
+ "\n"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 11,
431
+ "id": "af28ebbd054cdc33",
432
+ "metadata": {
433
+ "ExecuteTime": {
434
+ "end_time": "2024-05-04T22:25:33.206952Z",
435
+ "start_time": "2024-05-04T22:25:33.205262Z"
436
+ },
437
+ "collapsed": false
438
+ },
439
+ "outputs": [],
440
+ "source": [
441
+ "\n",
442
+ "from collections import defaultdict\n",
443
+ "import json\n",
444
+ "import os\n",
445
+ "from matplotlib import pyplot as plt\n",
446
+ "import orjson\n",
447
+ "\n",
448
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
449
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
450
+ "\n",
451
+ "def normalize_runname(runname):\n",
452
+ " return runname.replace(\"/\", \"_\")\n",
453
+ "\n",
454
+ "grouped = (\n",
455
+ " df.groupby([\"runname\", \"steps\"])\n",
456
+ " .agg(\n",
457
+ " {\n",
458
+ " key: \"mean\" for key in metrics\n",
459
+ " }\n",
460
+ " )\n",
461
+ " .reset_index()\n",
462
+ ")\n",
463
+ "\n",
464
+ "file_id=\"../assets/data/plots/custom_filters\"\n",
465
+ "files = {}\n",
466
+ "for metric in metrics:\n",
467
+ " datas = {}\n",
468
+ " for name, group in grouped.groupby(\"runname\"):\n",
469
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
470
+ " group = group.set_index(\"steps\")\n",
471
+ " rolling_avg = group\n",
472
+ " # rolling_avg = group.rolling(window=5).mean()\n",
473
+ " datas[name] = {\n",
474
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
475
+ " \"y\": rolling_avg[metric].tolist(),\n",
476
+ " \"label\": runs_mapping[name],\n",
477
+ " }\n",
478
+ " # Sort the runs by their final metric value, highest first\n",
479
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
480
+ " # Create a folder\n",
481
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
482
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
483
+ " json.dump({\n",
484
+ " \"data\": datas,\n",
485
+ " \"layout\": {\n",
486
+ " \"title\": {\n",
487
+ " \"text\": \"Custom filters Performance\"\n",
488
+ " },\n",
489
+ " }\n",
490
+ " }, f)\n",
491
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
492
+ "# Create index\n",
493
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
494
+ " json.dump({\n",
495
+ " \"files\": files,\n",
496
+ " \"settings\": {\n",
497
+ " \"defaultMetric\": \"agg_score\",\n",
498
+ " \"slider\":{\"min\":0,\"max\":10,\"default\":3}\n",
499
+ " }\n",
500
+ " }, f)\n",
501
+ " "
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": null,
507
+ "id": "80a14409",
508
+ "metadata": {},
509
+ "outputs": [],
510
+ "source": []
511
+ }
512
+ ],
513
+ "metadata": {
514
+ "kernelspec": {
515
+ "display_name": "Python 3",
516
+ "language": "python",
517
+ "name": "python3"
518
+ },
519
+ "language_info": {
520
+ "codemirror_mode": {
521
+ "name": "ipython",
522
+ "version": 3
523
+ },
524
+ "file_extension": ".py",
525
+ "mimetype": "text/x-python",
526
+ "name": "python",
527
+ "nbconvert_exporter": "python",
528
+ "pygments_lexer": "ipython3",
529
+ "version": "3.12.2"
530
+ }
531
+ },
532
+ "nbformat": 4,
533
+ "nbformat_minor": 5
534
+ }
notebooks/plot_dataset_ablations.ipynb ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-14T09:06:04.487186Z",
10
+ "start_time": "2024-05-14T09:06:04.255111Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>steps</th>\n",
38
+ " <th>agg_score</th>\n",
39
+ " <th>commonsense_qa/acc</th>\n",
40
+ " <th>commonsense_qa/acc_norm</th>\n",
41
+ " <th>hellaswag/acc</th>\n",
42
+ " <th>hellaswag/acc_norm</th>\n",
43
+ " <th>openbookqa/acc</th>\n",
44
+ " <th>openbookqa/acc_norm</th>\n",
45
+ " <th>piqa/acc</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>C4</td>\n",
63
+ " <td>0</td>\n",
64
+ " <td>0.330893</td>\n",
65
+ " <td>0.186</td>\n",
66
+ " <td>0.233</td>\n",
67
+ " <td>0.272</td>\n",
68
+ " <td>0.258</td>\n",
69
+ " <td>0.166</td>\n",
70
+ " <td>0.286</td>\n",
71
+ " <td>0.542</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.208</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2195</td>\n",
80
+ " <td>0.2510</td>\n",
81
+ " <td>0.230294</td>\n",
82
+ " <td>0.250147</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>C4</td>\n",
87
+ " <td>1000</td>\n",
88
+ " <td>0.355112</td>\n",
89
+ " <td>0.229</td>\n",
90
+ " <td>0.260</td>\n",
91
+ " <td>0.286</td>\n",
92
+ " <td>0.288</td>\n",
93
+ " <td>0.128</td>\n",
94
+ " <td>0.250</td>\n",
95
+ " <td>0.614</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.351</td>\n",
98
+ " <td>0.404</td>\n",
99
+ " <td>0.519</td>\n",
100
+ " <td>0.476</td>\n",
101
+ " <td>0.565</td>\n",
102
+ " <td>0.518</td>\n",
103
+ " <td>0.2680</td>\n",
104
+ " <td>0.2935</td>\n",
105
+ " <td>0.238951</td>\n",
106
+ " <td>0.250399</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>C4</td>\n",
111
+ " <td>2000</td>\n",
112
+ " <td>0.378435</td>\n",
113
+ " <td>0.268</td>\n",
114
+ " <td>0.278</td>\n",
115
+ " <td>0.312</td>\n",
116
+ " <td>0.330</td>\n",
117
+ " <td>0.122</td>\n",
118
+ " <td>0.276</td>\n",
119
+ " <td>0.646</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.375</td>\n",
122
+ " <td>0.400</td>\n",
123
+ " <td>0.509</td>\n",
124
+ " <td>0.500</td>\n",
125
+ " <td>0.676</td>\n",
126
+ " <td>0.577</td>\n",
127
+ " <td>0.3065</td>\n",
128
+ " <td>0.3230</td>\n",
129
+ " <td>0.247275</td>\n",
130
+ " <td>0.255482</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>C4</td>\n",
135
+ " <td>3000</td>\n",
136
+ " <td>0.387795</td>\n",
137
+ " <td>0.280</td>\n",
138
+ " <td>0.295</td>\n",
139
+ " <td>0.331</td>\n",
140
+ " <td>0.380</td>\n",
141
+ " <td>0.152</td>\n",
142
+ " <td>0.274</td>\n",
143
+ " <td>0.660</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.376</td>\n",
146
+ " <td>0.387</td>\n",
147
+ " <td>0.512</td>\n",
148
+ " <td>0.496</td>\n",
149
+ " <td>0.725</td>\n",
150
+ " <td>0.621</td>\n",
151
+ " <td>0.3175</td>\n",
152
+ " <td>0.3340</td>\n",
153
+ " <td>0.254534</td>\n",
154
+ " <td>0.267363</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>C4</td>\n",
159
+ " <td>4000</td>\n",
160
+ " <td>0.399320</td>\n",
161
+ " <td>0.296</td>\n",
162
+ " <td>0.298</td>\n",
163
+ " <td>0.351</td>\n",
164
+ " <td>0.406</td>\n",
165
+ " <td>0.168</td>\n",
166
+ " <td>0.282</td>\n",
167
+ " <td>0.676</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.382</td>\n",
170
+ " <td>0.404</td>\n",
171
+ " <td>0.522</td>\n",
172
+ " <td>0.503</td>\n",
173
+ " <td>0.723</td>\n",
174
+ " <td>0.618</td>\n",
175
+ " <td>0.3255</td>\n",
176
+ " <td>0.3470</td>\n",
177
+ " <td>0.254762</td>\n",
178
+ " <td>0.263563</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>1171</th>\n",
206
+ " <td>The Pile</td>\n",
207
+ " <td>163000</td>\n",
208
+ " <td>0.463789</td>\n",
209
+ " <td>0.379</td>\n",
210
+ " <td>0.349</td>\n",
211
+ " <td>0.441</td>\n",
212
+ " <td>0.555</td>\n",
213
+ " <td>0.240</td>\n",
214
+ " <td>0.366</td>\n",
215
+ " <td>0.701</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.405</td>\n",
218
+ " <td>0.388</td>\n",
219
+ " <td>0.585</td>\n",
220
+ " <td>0.560</td>\n",
221
+ " <td>0.875</td>\n",
222
+ " <td>0.820</td>\n",
223
+ " <td>0.4475</td>\n",
224
+ " <td>0.4450</td>\n",
225
+ " <td>0.299378</td>\n",
226
+ " <td>0.326313</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>1172</th>\n",
230
+ " <td>The Pile</td>\n",
231
+ " <td>164000</td>\n",
232
+ " <td>0.462758</td>\n",
233
+ " <td>0.369</td>\n",
234
+ " <td>0.344</td>\n",
235
+ " <td>0.438</td>\n",
236
+ " <td>0.552</td>\n",
237
+ " <td>0.248</td>\n",
238
+ " <td>0.348</td>\n",
239
+ " <td>0.708</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.395</td>\n",
242
+ " <td>0.401</td>\n",
243
+ " <td>0.577</td>\n",
244
+ " <td>0.567</td>\n",
245
+ " <td>0.874</td>\n",
246
+ " <td>0.806</td>\n",
247
+ " <td>0.4465</td>\n",
248
+ " <td>0.4355</td>\n",
249
+ " <td>0.302083</td>\n",
250
+ " <td>0.331563</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>1173</th>\n",
254
+ " <td>The Pile</td>\n",
255
+ " <td>165000</td>\n",
256
+ " <td>0.465026</td>\n",
257
+ " <td>0.383</td>\n",
258
+ " <td>0.350</td>\n",
259
+ " <td>0.438</td>\n",
260
+ " <td>0.553</td>\n",
261
+ " <td>0.234</td>\n",
262
+ " <td>0.352</td>\n",
263
+ " <td>0.707</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.400</td>\n",
266
+ " <td>0.401</td>\n",
267
+ " <td>0.569</td>\n",
268
+ " <td>0.556</td>\n",
269
+ " <td>0.874</td>\n",
270
+ " <td>0.811</td>\n",
271
+ " <td>0.4460</td>\n",
272
+ " <td>0.4455</td>\n",
273
+ " <td>0.305193</td>\n",
274
+ " <td>0.331708</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>1174</th>\n",
278
+ " <td>The Pile</td>\n",
279
+ " <td>166000</td>\n",
280
+ " <td>0.462349</td>\n",
281
+ " <td>0.377</td>\n",
282
+ " <td>0.346</td>\n",
283
+ " <td>0.440</td>\n",
284
+ " <td>0.557</td>\n",
285
+ " <td>0.228</td>\n",
286
+ " <td>0.346</td>\n",
287
+ " <td>0.711</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.398</td>\n",
290
+ " <td>0.398</td>\n",
291
+ " <td>0.572</td>\n",
292
+ " <td>0.558</td>\n",
293
+ " <td>0.877</td>\n",
294
+ " <td>0.811</td>\n",
295
+ " <td>0.4525</td>\n",
296
+ " <td>0.4385</td>\n",
297
+ " <td>0.301952</td>\n",
298
+ " <td>0.331295</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>1175</th>\n",
302
+ " <td>The Pile</td>\n",
303
+ " <td>167000</td>\n",
304
+ " <td>0.464539</td>\n",
305
+ " <td>0.386</td>\n",
306
+ " <td>0.354</td>\n",
307
+ " <td>0.434</td>\n",
308
+ " <td>0.557</td>\n",
309
+ " <td>0.232</td>\n",
310
+ " <td>0.356</td>\n",
311
+ " <td>0.706</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.402</td>\n",
314
+ " <td>0.402</td>\n",
315
+ " <td>0.573</td>\n",
316
+ " <td>0.559</td>\n",
317
+ " <td>0.867</td>\n",
318
+ " <td>0.802</td>\n",
319
+ " <td>0.4475</td>\n",
320
+ " <td>0.4375</td>\n",
321
+ " <td>0.301934</td>\n",
322
+ " <td>0.330810</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>1176 rows × 21 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname steps agg_score commonsense_qa/acc \\\n",
331
+ "0 C4 0 0.330893 0.186 \n",
332
+ "1 C4 1000 0.355112 0.229 \n",
333
+ "2 C4 2000 0.378435 0.268 \n",
334
+ "3 C4 3000 0.387795 0.280 \n",
335
+ "4 C4 4000 0.399320 0.296 \n",
336
+ "... ... ... ... ... \n",
337
+ "1171 The Pile 163000 0.463789 0.379 \n",
338
+ "1172 The Pile 164000 0.462758 0.369 \n",
339
+ "1173 The Pile 165000 0.465026 0.383 \n",
340
+ "1174 The Pile 166000 0.462349 0.377 \n",
341
+ "1175 The Pile 167000 0.464539 0.386 \n",
342
+ "\n",
343
+ " commonsense_qa/acc_norm hellaswag/acc hellaswag/acc_norm \\\n",
344
+ "0 0.233 0.272 0.258 \n",
345
+ "1 0.260 0.286 0.288 \n",
346
+ "2 0.278 0.312 0.330 \n",
347
+ "3 0.295 0.331 0.380 \n",
348
+ "4 0.298 0.351 0.406 \n",
349
+ "... ... ... ... \n",
350
+ "1171 0.349 0.441 0.555 \n",
351
+ "1172 0.344 0.438 0.552 \n",
352
+ "1173 0.350 0.438 0.553 \n",
353
+ "1174 0.346 0.440 0.557 \n",
354
+ "1175 0.354 0.434 0.557 \n",
355
+ "\n",
356
+ " openbookqa/acc openbookqa/acc_norm piqa/acc ... siqa/acc \\\n",
357
+ "0 0.166 0.286 0.542 ... 0.367 \n",
358
+ "1 0.128 0.250 0.614 ... 0.351 \n",
359
+ "2 0.122 0.276 0.646 ... 0.375 \n",
360
+ "3 0.152 0.274 0.660 ... 0.376 \n",
361
+ "4 0.168 0.282 0.676 ... 0.382 \n",
362
+ "... ... ... ... ... ... \n",
363
+ "1171 0.240 0.366 0.701 ... 0.405 \n",
364
+ "1172 0.248 0.348 0.708 ... 0.395 \n",
365
+ "1173 0.234 0.352 0.707 ... 0.400 \n",
366
+ "1174 0.228 0.346 0.711 ... 0.398 \n",
367
+ "1175 0.232 0.356 0.706 ... 0.402 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.208 \n",
371
+ "1 0.404 0.519 0.476 0.565 \n",
372
+ "2 0.400 0.509 0.500 0.676 \n",
373
+ "3 0.387 0.512 0.496 0.725 \n",
374
+ "4 0.404 0.522 0.503 0.723 \n",
375
+ "... ... ... ... ... \n",
376
+ "1171 0.388 0.585 0.560 0.875 \n",
377
+ "1172 0.401 0.577 0.567 0.874 \n",
378
+ "1173 0.401 0.569 0.556 0.874 \n",
379
+ "1174 0.398 0.572 0.558 0.877 \n",
380
+ "1175 0.402 0.573 0.559 0.867 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
384
+ "1 0.518 0.2680 0.2935 0.238951 0.250399 \n",
385
+ "2 0.577 0.3065 0.3230 0.247275 0.255482 \n",
386
+ "3 0.621 0.3175 0.3340 0.254534 0.267363 \n",
387
+ "4 0.618 0.3255 0.3470 0.254762 0.263563 \n",
388
+ "... ... ... ... ... ... \n",
389
+ "1171 0.820 0.4475 0.4450 0.299378 0.326313 \n",
390
+ "1172 0.806 0.4465 0.4355 0.302083 0.331563 \n",
391
+ "1173 0.811 0.4460 0.4455 0.305193 0.331708 \n",
392
+ "1174 0.811 0.4525 0.4385 0.301952 0.331295 \n",
393
+ "1175 0.802 0.4475 0.4375 0.301934 0.330810 \n",
394
+ "\n",
395
+ "[1176 rows x 21 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 4,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/eval_results.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 2,
414
+ "id": "b610f43caefdf01",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-05-14T09:06:04.563945Z",
418
+ "start_time": "2024-05-14T09:06:04.562142Z"
419
+ },
420
+ "collapsed": false
421
+ },
422
+ "outputs": [],
423
+ "source": []
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": 5,
428
+ "id": "initial_id",
429
+ "metadata": {
430
+ "ExecuteTime": {
431
+ "end_time": "2024-05-14T09:06:37.927921Z",
432
+ "start_time": "2024-05-14T09:06:37.588025Z"
433
+ },
434
+ "collapsed": true
435
+ },
436
+ "outputs": [],
437
+ "source": [
438
+ "import json\n",
439
+ "import os\n",
440
+ "from matplotlib import pyplot as plt\n",
441
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
442
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
443
+ "\n",
444
+ "def normalize_runname(runname):\n",
445
+ " return runname.replace(\"/\", \"_\")\n",
446
+ "\n",
447
+ "grouped = (\n",
448
+ " df.groupby([\"runname\", \"steps\"])\n",
449
+ " .agg(\n",
450
+ " {\n",
451
+ " key: \"mean\" for key in metrics\n",
452
+ " }\n",
453
+ " )\n",
454
+ " .reset_index()\n",
455
+ ")\n",
456
+ "\n",
457
+ "file_id=\"../assets/data/plots/dataset_ablations\"\n",
458
+ "files = {}\n",
459
+ "for metric in metrics:\n",
460
+ " datas = {}\n",
461
+ " for name, group in grouped.groupby(\"runname\"):\n",
462
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
463
+ " group = group.set_index(\"steps\")\n",
464
+ " rolling_avg = group\n",
465
+ " # rolling_avg = group.rolling(window=5).mean()\n",
466
+ " datas[name] = {\n",
467
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
468
+ " \"y\": rolling_avg[metric].tolist(),\n",
469
+ " \"label\": name,\n",
470
+ " }\n",
471
+ " # Sort the runs by their final metric value, highest first\n",
472
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
473
+ " # Create a folder\n",
474
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
475
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
476
+ " json.dump({\n",
477
+ " \"data\": datas,\n",
478
+ " \"layout\": {\n",
479
+ " \"title\": {\n",
480
+ " \"text\": \"Dataset ablations\"\n",
481
+ " },\n",
482
+ " }\n",
483
+ " }, f)\n",
484
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
485
+ "# Create index\n",
486
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
487
+ " json.dump({\n",
488
+ " \"files\": files,\n",
489
+ " \"settings\": {\n",
490
+ " \"defaultMetric\": \"agg_score\",\n",
491
+ " \"slider\":{\"min\":0,\"max\":30,\"default\":5}\n",
492
+ " }\n",
493
+ " }, f)\n",
494
+ " "
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": 7,
500
+ "id": "af28ebbd054cdc33",
501
+ "metadata": {
502
+ "ExecuteTime": {
503
+ "end_time": "2024-05-04T22:25:33.206952Z",
504
+ "start_time": "2024-05-04T22:25:33.205262Z"
505
+ },
506
+ "collapsed": false
507
+ },
508
+ "outputs": [],
509
+ "source": []
510
+ }
511
+ ],
512
+ "metadata": {
513
+ "kernelspec": {
514
+ "display_name": "Python 3",
515
+ "language": "python",
516
+ "name": "python3"
517
+ },
518
+ "language_info": {
519
+ "codemirror_mode": {
520
+ "name": "ipython",
521
+ "version": 3
522
+ },
523
+ "file_extension": ".py",
524
+ "mimetype": "text/x-python",
525
+ "name": "python",
526
+ "nbconvert_exporter": "python",
527
+ "pygments_lexer": "ipython3",
528
+ "version": "3.12.2"
529
+ }
530
+ },
531
+ "nbformat": 4,
532
+ "nbformat_minor": 5
533
+ }
notebooks/plot_dedup_all_dumps_bad.ipynb ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-04-30T15:07:36.238754Z",
10
+ "start_time": "2024-04-30T15:07:35.974657Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
63
+ " <td>6</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330893</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.209</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2195</td>\n",
80
+ " <td>0.2510</td>\n",
81
+ " <td>0.230294</td>\n",
82
+ " <td>0.250147</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
87
+ " <td>6</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.360520</td>\n",
90
+ " <td>0.254</td>\n",
91
+ " <td>0.260</td>\n",
92
+ " <td>0.290</td>\n",
93
+ " <td>0.281</td>\n",
94
+ " <td>0.138</td>\n",
95
+ " <td>0.256</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.362</td>\n",
98
+ " <td>0.400</td>\n",
99
+ " <td>0.517</td>\n",
100
+ " <td>0.524</td>\n",
101
+ " <td>0.573</td>\n",
102
+ " <td>0.515</td>\n",
103
+ " <td>0.2675</td>\n",
104
+ " <td>0.2895</td>\n",
105
+ " <td>0.239489</td>\n",
106
+ " <td>0.251660</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
111
+ " <td>6</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.373315</td>\n",
114
+ " <td>0.285</td>\n",
115
+ " <td>0.278</td>\n",
116
+ " <td>0.315</td>\n",
117
+ " <td>0.323</td>\n",
118
+ " <td>0.138</td>\n",
119
+ " <td>0.272</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.365</td>\n",
122
+ " <td>0.395</td>\n",
123
+ " <td>0.509</td>\n",
124
+ " <td>0.490</td>\n",
125
+ " <td>0.677</td>\n",
126
+ " <td>0.596</td>\n",
127
+ " <td>0.3075</td>\n",
128
+ " <td>0.3235</td>\n",
129
+ " <td>0.250318</td>\n",
130
+ " <td>0.261019</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
135
+ " <td>6</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.388201</td>\n",
138
+ " <td>0.294</td>\n",
139
+ " <td>0.291</td>\n",
140
+ " <td>0.327</td>\n",
141
+ " <td>0.341</td>\n",
142
+ " <td>0.152</td>\n",
143
+ " <td>0.298</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.371</td>\n",
146
+ " <td>0.396</td>\n",
147
+ " <td>0.512</td>\n",
148
+ " <td>0.504</td>\n",
149
+ " <td>0.712</td>\n",
150
+ " <td>0.621</td>\n",
151
+ " <td>0.3220</td>\n",
152
+ " <td>0.3390</td>\n",
153
+ " <td>0.255646</td>\n",
154
+ " <td>0.266605</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
159
+ " <td>6</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.393412</td>\n",
162
+ " <td>0.306</td>\n",
163
+ " <td>0.307</td>\n",
164
+ " <td>0.337</td>\n",
165
+ " <td>0.360</td>\n",
166
+ " <td>0.172</td>\n",
167
+ " <td>0.284</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.380</td>\n",
170
+ " <td>0.402</td>\n",
171
+ " <td>0.522</td>\n",
172
+ " <td>0.510</td>\n",
173
+ " <td>0.729</td>\n",
174
+ " <td>0.612</td>\n",
175
+ " <td>0.3100</td>\n",
176
+ " <td>0.3385</td>\n",
177
+ " <td>0.253048</td>\n",
178
+ " <td>0.266798</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>501</th>\n",
206
+ " <td>big-run-fineweb-cross-dedup-fixed</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>163000</td>\n",
209
+ " <td>0.466306</td>\n",
210
+ " <td>0.391</td>\n",
211
+ " <td>0.371</td>\n",
212
+ " <td>0.459</td>\n",
213
+ " <td>0.547</td>\n",
214
+ " <td>0.210</td>\n",
215
+ " <td>0.344</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.401</td>\n",
218
+ " <td>0.388</td>\n",
219
+ " <td>0.564</td>\n",
220
+ " <td>0.562</td>\n",
221
+ " <td>0.884</td>\n",
222
+ " <td>0.807</td>\n",
223
+ " <td>0.4535</td>\n",
224
+ " <td>0.4450</td>\n",
225
+ " <td>0.300475</td>\n",
226
+ " <td>0.320448</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>502</th>\n",
230
+ " <td>big-run-fineweb-cross-dedup-fixed</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>164000</td>\n",
233
+ " <td>0.468313</td>\n",
234
+ " <td>0.395</td>\n",
235
+ " <td>0.374</td>\n",
236
+ " <td>0.459</td>\n",
237
+ " <td>0.548</td>\n",
238
+ " <td>0.208</td>\n",
239
+ " <td>0.350</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.402</td>\n",
242
+ " <td>0.395</td>\n",
243
+ " <td>0.559</td>\n",
244
+ " <td>0.561</td>\n",
245
+ " <td>0.876</td>\n",
246
+ " <td>0.795</td>\n",
247
+ " <td>0.4540</td>\n",
248
+ " <td>0.4445</td>\n",
249
+ " <td>0.299279</td>\n",
250
+ " <td>0.321007</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>503</th>\n",
254
+ " <td>big-run-fineweb-cross-dedup-fixed</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>165000</td>\n",
257
+ " <td>0.468639</td>\n",
258
+ " <td>0.397</td>\n",
259
+ " <td>0.374</td>\n",
260
+ " <td>0.450</td>\n",
261
+ " <td>0.548</td>\n",
262
+ " <td>0.208</td>\n",
263
+ " <td>0.358</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.400</td>\n",
266
+ " <td>0.391</td>\n",
267
+ " <td>0.552</td>\n",
268
+ " <td>0.556</td>\n",
269
+ " <td>0.876</td>\n",
270
+ " <td>0.787</td>\n",
271
+ " <td>0.4490</td>\n",
272
+ " <td>0.4420</td>\n",
273
+ " <td>0.298460</td>\n",
274
+ " <td>0.319108</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>504</th>\n",
278
+ " <td>big-run-fineweb-cross-dedup-fixed</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>166000</td>\n",
281
+ " <td>0.465767</td>\n",
282
+ " <td>0.412</td>\n",
283
+ " <td>0.375</td>\n",
284
+ " <td>0.458</td>\n",
285
+ " <td>0.552</td>\n",
286
+ " <td>0.214</td>\n",
287
+ " <td>0.348</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.403</td>\n",
290
+ " <td>0.398</td>\n",
291
+ " <td>0.551</td>\n",
292
+ " <td>0.553</td>\n",
293
+ " <td>0.877</td>\n",
294
+ " <td>0.802</td>\n",
295
+ " <td>0.4465</td>\n",
296
+ " <td>0.4345</td>\n",
297
+ " <td>0.298333</td>\n",
298
+ " <td>0.318637</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>505</th>\n",
302
+ " <td>big-run-fineweb-cross-dedup-fixed</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>167000</td>\n",
305
+ " <td>0.469262</td>\n",
306
+ " <td>0.399</td>\n",
307
+ " <td>0.377</td>\n",
308
+ " <td>0.459</td>\n",
309
+ " <td>0.550</td>\n",
310
+ " <td>0.220</td>\n",
311
+ " <td>0.348</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.406</td>\n",
314
+ " <td>0.401</td>\n",
315
+ " <td>0.564</td>\n",
316
+ " <td>0.560</td>\n",
317
+ " <td>0.882</td>\n",
318
+ " <td>0.798</td>\n",
319
+ " <td>0.4480</td>\n",
320
+ " <td>0.4405</td>\n",
321
+ " <td>0.297617</td>\n",
322
+ " <td>0.319592</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>506 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 big-run-sampled_full_filtered_no_dedup 6 0 0.330893 \n",
332
+ "1 big-run-sampled_full_filtered_no_dedup 6 1000 0.360520 \n",
333
+ "2 big-run-sampled_full_filtered_no_dedup 6 2000 0.373315 \n",
334
+ "3 big-run-sampled_full_filtered_no_dedup 6 3000 0.388201 \n",
335
+ "4 big-run-sampled_full_filtered_no_dedup 6 4000 0.393412 \n",
336
+ ".. ... ... ... ... \n",
337
+ "501 big-run-fineweb-cross-dedup-fixed 6 163000 0.466306 \n",
338
+ "502 big-run-fineweb-cross-dedup-fixed 6 164000 0.468313 \n",
339
+ "503 big-run-fineweb-cross-dedup-fixed 6 165000 0.468639 \n",
340
+ "504 big-run-fineweb-cross-dedup-fixed 6 166000 0.465767 \n",
341
+ "505 big-run-fineweb-cross-dedup-fixed 6 167000 0.469262 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.254 0.260 0.290 \n",
346
+ "2 0.285 0.278 0.315 \n",
347
+ "3 0.294 0.291 0.327 \n",
348
+ "4 0.306 0.307 0.337 \n",
349
+ ".. ... ... ... \n",
350
+ "501 0.391 0.371 0.459 \n",
351
+ "502 0.395 0.374 0.459 \n",
352
+ "503 0.397 0.374 0.450 \n",
353
+ "504 0.412 0.375 0.458 \n",
354
+ "505 0.399 0.377 0.459 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.281 0.138 0.256 ... 0.362 \n",
359
+ "2 0.323 0.138 0.272 ... 0.365 \n",
360
+ "3 0.341 0.152 0.298 ... 0.371 \n",
361
+ "4 0.360 0.172 0.284 ... 0.380 \n",
362
+ ".. ... ... ... ... ... \n",
363
+ "501 0.547 0.210 0.344 ... 0.401 \n",
364
+ "502 0.548 0.208 0.350 ... 0.402 \n",
365
+ "503 0.548 0.208 0.358 ... 0.400 \n",
366
+ "504 0.552 0.214 0.348 ... 0.403 \n",
367
+ "505 0.550 0.220 0.348 ... 0.406 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.209 \n",
371
+ "1 0.400 0.517 0.524 0.573 \n",
372
+ "2 0.395 0.509 0.490 0.677 \n",
373
+ "3 0.396 0.512 0.504 0.712 \n",
374
+ "4 0.402 0.522 0.510 0.729 \n",
375
+ ".. ... ... ... ... \n",
376
+ "501 0.388 0.564 0.562 0.884 \n",
377
+ "502 0.395 0.559 0.561 0.876 \n",
378
+ "503 0.391 0.552 0.556 0.876 \n",
379
+ "504 0.398 0.551 0.553 0.877 \n",
380
+ "505 0.401 0.564 0.560 0.882 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
384
+ "1 0.515 0.2675 0.2895 0.239489 0.251660 \n",
385
+ "2 0.596 0.3075 0.3235 0.250318 0.261019 \n",
386
+ "3 0.621 0.3220 0.3390 0.255646 0.266605 \n",
387
+ "4 0.612 0.3100 0.3385 0.253048 0.266798 \n",
388
+ ".. ... ... ... ... ... \n",
389
+ "501 0.807 0.4535 0.4450 0.300475 0.320448 \n",
390
+ "502 0.795 0.4540 0.4445 0.299279 0.321007 \n",
391
+ "503 0.787 0.4490 0.4420 0.298460 0.319108 \n",
392
+ "504 0.802 0.4465 0.4345 0.298333 0.318637 \n",
393
+ "505 0.798 0.4480 0.4405 0.297617 0.319592 \n",
394
+ "\n",
395
+ "[506 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 1,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/cross_dedup_refinedweb_filtered.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 13,
414
+ "id": "b610f43caefdf01",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-04-30T15:07:36.242016Z",
418
+ "start_time": "2024-04-30T15:07:36.239657Z"
419
+ },
420
+ "collapsed": false
421
+ },
422
+ "outputs": [],
423
+ "source": [
424
+ "runs_mapping = {\n",
425
+ " \"big-run-refinedweb\": \"RefinedWeb\",\n",
426
+ " \"big-run-fineweb-cross-dedup-fixed\": \"FineWeb full MinHash\",\n",
427
+ " \"big-run-sampled_full_filtered_no_dedup\": \"FineWeb filtered only\"\n",
428
+ "}"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": 15,
434
+ "id": "initial_id",
435
+ "metadata": {
436
+ "ExecuteTime": {
437
+ "end_time": "2024-04-30T15:07:36.360665Z",
438
+ "start_time": "2024-04-30T15:07:36.242724Z"
439
+ },
440
+ "collapsed": true
441
+ },
442
+ "outputs": [
443
+ {
444
+ "name": "stderr",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
448
+ ]
449
+ },
450
+ {
451
+ "data": {
452
+ "image/png": "",
453
+ "text/plain": [
454
+ "<Figure size 640x480 with 1 Axes>"
455
+ ]
456
+ },
457
+ "metadata": {},
458
+ "output_type": "display_data"
459
+ }
460
+ ],
461
+ "source": [
462
+ "from matplotlib import pyplot as plt\n",
463
+ "from matplotlib import pyplot as plt\n",
464
+ "\n",
465
+ "import json\n",
466
+ "import os\n",
467
+ "from matplotlib import pyplot as plt\n",
468
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
469
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
470
+ "\n",
471
+ "def normalize_runname(runname):\n",
472
+ " return runname.replace(\"/\", \"_\")\n",
473
+ "\n",
474
+ "grouped = (\n",
475
+ " df.groupby([\"runname\", \"steps\"])\n",
476
+ " .agg(\n",
477
+ " {\n",
478
+ " key: \"mean\" for key in metrics\n",
479
+ " }\n",
480
+ " )\n",
481
+ " .reset_index()\n",
482
+ ")\n",
483
+ "\n",
484
+ "file_id=\"../assets/data/plots/all_dumps_bad\"\n",
485
+ "files = {}\n",
486
+ "for metric in metrics:\n",
487
+ " datas = {}\n",
488
+ " for name, group in grouped.groupby(\"runname\"):\n",
489
+ " # if name not in runs_mapping:\n",
490
+ " # continue\n",
491
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
492
+ " group = group.set_index(\"steps\")\n",
493
+ " rolling_avg = group\n",
494
+ " # rolling_avg = group.rolling(window=5).mean()\n",
495
+ " datas[name] = {\n",
496
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
497
+ " \"y\": rolling_avg[metric].tolist(),\n",
498
+ " \"label\": runs_mapping[name],\n",
499
+ " }\n",
500
+ " # Sort the runs by their final metric value, highest first\n",
501
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
502
+ " # Create a folder\n",
503
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
504
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
505
+ " json.dump({\n",
506
+ " \"data\": datas,\n",
507
+ " \"layout\": {\n",
508
+ " \"title\": {\n",
509
+ " \"text\": \"Dedup across all dumps does not improve performance\"\n",
510
+ " },\n",
511
+ " }\n",
512
+ " }, f)\n",
513
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
514
+ "# Create index\n",
515
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
516
+ " json.dump({\n",
517
+ " \"files\": files,\n",
518
+ " \"settings\": {\n",
519
+ " \"defaultMetric\": \"agg_score\",\n",
520
+ " \"slider\":{\"min\":0,\"max\":30,\"default\":5}\n",
521
+ " }\n",
522
+ " }, f)\n",
523
+ "# Add labels and legend\n",
524
+ "plt.xlabel('Training tokens (billions)')\n",
525
+ "plt.ylabel('Agg Score')\n",
526
+ "plt.title('Dedup across all dumps does not improve performance')\n",
527
+ "plt.legend()\n",
528
+ "\n",
529
+ "# Show the plot\n",
530
+ "plt.show()"
531
+ ]
532
+ },
533
+ {
534
+ "cell_type": "code",
535
+ "execution_count": 4,
536
+ "id": "af28ebbd054cdc33",
537
+ "metadata": {
538
+ "ExecuteTime": {
539
+ "end_time": "2024-04-30T15:07:36.363849Z",
540
+ "start_time": "2024-04-30T15:07:36.362222Z"
541
+ },
542
+ "collapsed": false
543
+ },
544
+ "outputs": [],
545
+ "source": []
546
+ }
547
+ ],
548
+ "metadata": {
549
+ "kernelspec": {
550
+ "display_name": "Python 3",
551
+ "language": "python",
552
+ "name": "python3"
553
+ },
554
+ "language_info": {
555
+ "codemirror_mode": {
556
+ "name": "ipython",
557
+ "version": 3
558
+ },
559
+ "file_extension": ".py",
560
+ "mimetype": "text/x-python",
561
+ "name": "python",
562
+ "nbconvert_exporter": "python",
563
+ "pygments_lexer": "ipython3",
564
+ "version": "3.12.2"
565
+ }
566
+ },
567
+ "nbformat": 4,
568
+ "nbformat_minor": 5
569
+ }
notebooks/plot_dedup_attempts.ipynb ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-04-30T15:08:02.398435Z",
10
+ "start_time": "2024-04-30T15:08:02.194901Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>big-run-refinedweb</td>\n",
63
+ " <td>6</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330893</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.208</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2195</td>\n",
80
+ " <td>0.2510</td>\n",
81
+ " <td>0.230294</td>\n",
82
+ " <td>0.250147</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>big-run-refinedweb</td>\n",
87
+ " <td>6</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.353481</td>\n",
90
+ " <td>0.233</td>\n",
91
+ " <td>0.253</td>\n",
92
+ " <td>0.288</td>\n",
93
+ " <td>0.276</td>\n",
94
+ " <td>0.120</td>\n",
95
+ " <td>0.256</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.365</td>\n",
98
+ " <td>0.398</td>\n",
99
+ " <td>0.502</td>\n",
100
+ " <td>0.500</td>\n",
101
+ " <td>0.582</td>\n",
102
+ " <td>0.528</td>\n",
103
+ " <td>0.2650</td>\n",
104
+ " <td>0.2900</td>\n",
105
+ " <td>0.240583</td>\n",
106
+ " <td>0.252852</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>big-run-refinedweb</td>\n",
111
+ " <td>6</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.376461</td>\n",
114
+ " <td>0.282</td>\n",
115
+ " <td>0.280</td>\n",
116
+ " <td>0.315</td>\n",
117
+ " <td>0.328</td>\n",
118
+ " <td>0.154</td>\n",
119
+ " <td>0.284</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.368</td>\n",
122
+ " <td>0.390</td>\n",
123
+ " <td>0.511</td>\n",
124
+ " <td>0.498</td>\n",
125
+ " <td>0.683</td>\n",
126
+ " <td>0.590</td>\n",
127
+ " <td>0.3055</td>\n",
128
+ " <td>0.3170</td>\n",
129
+ " <td>0.245067</td>\n",
130
+ " <td>0.261686</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>big-run-refinedweb</td>\n",
135
+ " <td>6</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.387825</td>\n",
138
+ " <td>0.282</td>\n",
139
+ " <td>0.287</td>\n",
140
+ " <td>0.331</td>\n",
141
+ " <td>0.350</td>\n",
142
+ " <td>0.152</td>\n",
143
+ " <td>0.306</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.376</td>\n",
146
+ " <td>0.386</td>\n",
147
+ " <td>0.512</td>\n",
148
+ " <td>0.495</td>\n",
149
+ " <td>0.748</td>\n",
150
+ " <td>0.646</td>\n",
151
+ " <td>0.3210</td>\n",
152
+ " <td>0.3410</td>\n",
153
+ " <td>0.250268</td>\n",
154
+ " <td>0.266600</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>big-run-refinedweb</td>\n",
159
+ " <td>6</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.398105</td>\n",
162
+ " <td>0.310</td>\n",
163
+ " <td>0.318</td>\n",
164
+ " <td>0.340</td>\n",
165
+ " <td>0.389</td>\n",
166
+ " <td>0.168</td>\n",
167
+ " <td>0.306</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.371</td>\n",
170
+ " <td>0.392</td>\n",
171
+ " <td>0.513</td>\n",
172
+ " <td>0.495</td>\n",
173
+ " <td>0.736</td>\n",
174
+ " <td>0.634</td>\n",
175
+ " <td>0.3305</td>\n",
176
+ " <td>0.3425</td>\n",
177
+ " <td>0.250732</td>\n",
178
+ " <td>0.268341</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>1339</th>\n",
206
+ " <td>big-run-url_dedups_lowercase_char_length</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>163000</td>\n",
209
+ " <td>0.477694</td>\n",
210
+ " <td>0.396</td>\n",
211
+ " <td>0.375</td>\n",
212
+ " <td>0.477</td>\n",
213
+ " <td>0.578</td>\n",
214
+ " <td>0.226</td>\n",
215
+ " <td>0.354</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.408</td>\n",
218
+ " <td>0.415</td>\n",
219
+ " <td>0.562</td>\n",
220
+ " <td>0.548</td>\n",
221
+ " <td>0.879</td>\n",
222
+ " <td>0.817</td>\n",
223
+ " <td>0.4655</td>\n",
224
+ " <td>0.4540</td>\n",
225
+ " <td>0.303672</td>\n",
226
+ " <td>0.325554</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>1340</th>\n",
230
+ " <td>big-run-url_dedups_lowercase_char_length</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>164000</td>\n",
233
+ " <td>0.476591</td>\n",
234
+ " <td>0.396</td>\n",
235
+ " <td>0.375</td>\n",
236
+ " <td>0.478</td>\n",
237
+ " <td>0.581</td>\n",
238
+ " <td>0.228</td>\n",
239
+ " <td>0.342</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.417</td>\n",
242
+ " <td>0.414</td>\n",
243
+ " <td>0.555</td>\n",
244
+ " <td>0.544</td>\n",
245
+ " <td>0.883</td>\n",
246
+ " <td>0.827</td>\n",
247
+ " <td>0.4600</td>\n",
248
+ " <td>0.4570</td>\n",
249
+ " <td>0.306406</td>\n",
250
+ " <td>0.329724</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>1341</th>\n",
254
+ " <td>big-run-url_dedups_lowercase_char_length</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>165000</td>\n",
257
+ " <td>0.478964</td>\n",
258
+ " <td>0.405</td>\n",
259
+ " <td>0.388</td>\n",
260
+ " <td>0.474</td>\n",
261
+ " <td>0.583</td>\n",
262
+ " <td>0.230</td>\n",
263
+ " <td>0.362</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.414</td>\n",
266
+ " <td>0.412</td>\n",
267
+ " <td>0.562</td>\n",
268
+ " <td>0.541</td>\n",
269
+ " <td>0.881</td>\n",
270
+ " <td>0.826</td>\n",
271
+ " <td>0.4545</td>\n",
272
+ " <td>0.4465</td>\n",
273
+ " <td>0.304121</td>\n",
274
+ " <td>0.327213</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>1342</th>\n",
278
+ " <td>big-run-url_dedups_lowercase_char_length</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>166000</td>\n",
281
+ " <td>0.477467</td>\n",
282
+ " <td>0.398</td>\n",
283
+ " <td>0.381</td>\n",
284
+ " <td>0.470</td>\n",
285
+ " <td>0.579</td>\n",
286
+ " <td>0.234</td>\n",
287
+ " <td>0.354</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.413</td>\n",
290
+ " <td>0.411</td>\n",
291
+ " <td>0.554</td>\n",
292
+ " <td>0.544</td>\n",
293
+ " <td>0.887</td>\n",
294
+ " <td>0.831</td>\n",
295
+ " <td>0.4625</td>\n",
296
+ " <td>0.4565</td>\n",
297
+ " <td>0.305855</td>\n",
298
+ " <td>0.328240</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>1343</th>\n",
302
+ " <td>big-run-url_dedups_lowercase_char_length</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>167000</td>\n",
305
+ " <td>0.476630</td>\n",
306
+ " <td>0.398</td>\n",
307
+ " <td>0.370</td>\n",
308
+ " <td>0.477</td>\n",
309
+ " <td>0.577</td>\n",
310
+ " <td>0.244</td>\n",
311
+ " <td>0.354</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.413</td>\n",
314
+ " <td>0.414</td>\n",
315
+ " <td>0.553</td>\n",
316
+ " <td>0.540</td>\n",
317
+ " <td>0.879</td>\n",
318
+ " <td>0.825</td>\n",
319
+ " <td>0.4660</td>\n",
320
+ " <td>0.4565</td>\n",
321
+ " <td>0.307940</td>\n",
322
+ " <td>0.328538</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>1344 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 big-run-refinedweb 6 0 0.330893 \n",
332
+ "1 big-run-refinedweb 6 1000 0.353481 \n",
333
+ "2 big-run-refinedweb 6 2000 0.376461 \n",
334
+ "3 big-run-refinedweb 6 3000 0.387825 \n",
335
+ "4 big-run-refinedweb 6 4000 0.398105 \n",
336
+ "... ... ... ... ... \n",
337
+ "1339 big-run-url_dedups_lowercase_char_length 6 163000 0.477694 \n",
338
+ "1340 big-run-url_dedups_lowercase_char_length 6 164000 0.476591 \n",
339
+ "1341 big-run-url_dedups_lowercase_char_length 6 165000 0.478964 \n",
340
+ "1342 big-run-url_dedups_lowercase_char_length 6 166000 0.477467 \n",
341
+ "1343 big-run-url_dedups_lowercase_char_length 6 167000 0.476630 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.233 0.253 0.288 \n",
346
+ "2 0.282 0.280 0.315 \n",
347
+ "3 0.282 0.287 0.331 \n",
348
+ "4 0.310 0.318 0.340 \n",
349
+ "... ... ... ... \n",
350
+ "1339 0.396 0.375 0.477 \n",
351
+ "1340 0.396 0.375 0.478 \n",
352
+ "1341 0.405 0.388 0.474 \n",
353
+ "1342 0.398 0.381 0.470 \n",
354
+ "1343 0.398 0.370 0.477 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.276 0.120 0.256 ... 0.365 \n",
359
+ "2 0.328 0.154 0.284 ... 0.368 \n",
360
+ "3 0.350 0.152 0.306 ... 0.376 \n",
361
+ "4 0.389 0.168 0.306 ... 0.371 \n",
362
+ "... ... ... ... ... ... \n",
363
+ "1339 0.578 0.226 0.354 ... 0.408 \n",
364
+ "1340 0.581 0.228 0.342 ... 0.417 \n",
365
+ "1341 0.583 0.230 0.362 ... 0.414 \n",
366
+ "1342 0.579 0.234 0.354 ... 0.413 \n",
367
+ "1343 0.577 0.244 0.354 ... 0.413 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.208 \n",
371
+ "1 0.398 0.502 0.500 0.582 \n",
372
+ "2 0.390 0.511 0.498 0.683 \n",
373
+ "3 0.386 0.512 0.495 0.748 \n",
374
+ "4 0.392 0.513 0.495 0.736 \n",
375
+ "... ... ... ... ... \n",
376
+ "1339 0.415 0.562 0.548 0.879 \n",
377
+ "1340 0.414 0.555 0.544 0.883 \n",
378
+ "1341 0.412 0.562 0.541 0.881 \n",
379
+ "1342 0.411 0.554 0.544 0.887 \n",
380
+ "1343 0.414 0.553 0.540 0.879 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
384
+ "1 0.528 0.2650 0.2900 0.240583 0.252852 \n",
385
+ "2 0.590 0.3055 0.3170 0.245067 0.261686 \n",
386
+ "3 0.646 0.3210 0.3410 0.250268 0.266600 \n",
387
+ "4 0.634 0.3305 0.3425 0.250732 0.268341 \n",
388
+ "... ... ... ... ... ... \n",
389
+ "1339 0.817 0.4655 0.4540 0.303672 0.325554 \n",
390
+ "1340 0.827 0.4600 0.4570 0.306406 0.329724 \n",
391
+ "1341 0.826 0.4545 0.4465 0.304121 0.327213 \n",
392
+ "1342 0.831 0.4625 0.4565 0.305855 0.328240 \n",
393
+ "1343 0.825 0.4660 0.4565 0.307940 0.328538 \n",
394
+ "\n",
395
+ "[1344 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 2,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/diff_dedup_attempts.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 3,
414
+ "id": "874ab88a573cd443",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-05-13T13:56:19.453420Z",
418
+ "start_time": "2024-05-13T13:56:19.450850Z"
419
+ }
420
+ },
421
+ "outputs": [
422
+ {
423
+ "data": {
424
+ "text/plain": [
425
+ "['big-run-refinedweb',\n",
426
+ " 'big-run-sampled_cross_minhash_dump',\n",
427
+ " 'big-run-sampled_full_filtered_no_dedup',\n",
428
+ " 'big-run-sampled_full_imh_linededup',\n",
429
+ " 'big-run-sampled_full_ind_minhash',\n",
430
+ " 'big-run-sampled_line_dedup_3lines2',\n",
431
+ " 'big-run-sampled_line_dedup_min_words',\n",
432
+ " 'big-run-url_dedups_lowercase_char_length']"
433
+ ]
434
+ },
435
+ "execution_count": 3,
436
+ "metadata": {},
437
+ "output_type": "execute_result"
438
+ }
439
+ ],
440
+ "source": [
441
+ "pd.unique(df[\"runname\"]).tolist()"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 4,
447
+ "id": "b610f43caefdf01",
448
+ "metadata": {
449
+ "ExecuteTime": {
450
+ "end_time": "2024-05-13T14:00:46.578560Z",
451
+ "start_time": "2024-05-13T14:00:46.576167Z"
452
+ },
453
+ "collapsed": false
454
+ },
455
+ "outputs": [],
456
+ "source": [
457
+ "runs_mapping = {\n",
458
+ " \"big-run-refinedweb\": \"RefinedWeb\",\n",
459
+ " \"big-run-sampled_cross_minhash_dump\": \"FineWeb full MinHash\",\n",
460
+ " \"big-run-sampled_full_filtered_no_dedup\": \"FineWeb filtered only\",\n",
461
+ " \"big-run-sampled_full_ind_minhash\": \"FineWeb independent MinHash\",\n",
462
+ " \"big-run-sampled_full_imh_linededup\": \"FineWeb line dedup\",\n",
463
+ " \"big-run-sampled_line_dedup_3lines2\": \"FineWeb 3-line dedup\",\n",
464
+ " \"big-run-sampled_line_dedup_min_words\": \"FineWeb line dedup w/ min words\",\n",
465
+ " \"big-run-url_dedups_lowercase_char_length\": \"FineWeb URL dedup\"\n",
466
+ "}"
467
+ ]
468
+ },
469
+ {
470
+ "cell_type": "code",
471
+ "execution_count": 5,
472
+ "id": "initial_id",
473
+ "metadata": {
474
+ "ExecuteTime": {
475
+ "end_time": "2024-05-13T14:04:41.777032Z",
476
+ "start_time": "2024-05-13T14:04:41.536919Z"
477
+ },
478
+ "collapsed": true
479
+ },
480
+ "outputs": [],
481
+ "source": [
482
+ "import json\n",
483
+ "import os\n",
484
+ "from matplotlib import pyplot as plt\n",
485
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
486
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
487
+ "\n",
488
+ "def normalize_runname(runname):\n",
489
+ " return runname.replace(\"/\", \"_\")\n",
490
+ "\n",
491
+ "grouped = (\n",
492
+ " df.groupby([\"runname\", \"steps\"])\n",
493
+ " .agg(\n",
494
+ " {\n",
495
+ " key: \"mean\" for key in metrics\n",
496
+ " }\n",
497
+ " )\n",
498
+ " .reset_index()\n",
499
+ ")\n",
500
+ "\n",
501
+ "file_id=\"../assets/data/plots/dedup_attempts\"\n",
502
+ "files = {}\n",
503
+ "for metric in metrics:\n",
504
+ " datas = {}\n",
505
+ " for name, group in grouped.groupby(\"runname\"):\n",
506
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
507
+ " group = group.set_index(\"steps\")\n",
508
+ " rolling_avg = group\n",
509
+ " # rolling_avg = group.rolling(window=5).mean()\n",
510
+ " datas[name] = {\n",
511
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
512
+ " \"y\": rolling_avg[metric].tolist(),\n",
513
+ " \"label\": runs_mapping[name],\n",
514
+ " }\n",
515
+ " # Sort the runs by their final metric value, highest first\n",
516
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
517
+ " # Create a folder\n",
518
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
519
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
520
+ " json.dump({\n",
521
+ " \"data\": datas,\n",
522
+ " \"layout\": {\n",
523
+ " \"title\": {\n",
524
+ " \"text\": \"Attempting to further globally dedup worsened perf\"\n",
525
+ " },\n",
526
+ " }\n",
527
+ " }, f)\n",
528
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
529
+ "# Create index\n",
530
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
531
+ " json.dump({\n",
532
+ " \"files\": files,\n",
533
+ " \"settings\": {\n",
534
+ " \"defaultMetric\": \"agg_score\",\n",
535
+ " \"slider\":{\"min\":0,\"max\":30,\"default\":5}\n",
536
+ " }\n",
537
+ " }, f)\n",
538
+ " \n",
539
+ " "
540
+ ]
541
+ },
542
+ {
543
+ "cell_type": "code",
544
+ "execution_count": 4,
545
+ "id": "af28ebbd054cdc33",
546
+ "metadata": {
547
+ "ExecuteTime": {
548
+ "end_time": "2024-04-30T15:08:02.522543Z",
549
+ "start_time": "2024-04-30T15:08:02.520569Z"
550
+ },
551
+ "collapsed": false
552
+ },
553
+ "outputs": [],
554
+ "source": []
555
+ }
556
+ ],
557
+ "metadata": {
558
+ "kernelspec": {
559
+ "display_name": "Python 3",
560
+ "language": "python",
561
+ "name": "python3"
562
+ },
563
+ "language_info": {
564
+ "codemirror_mode": {
565
+ "name": "ipython",
566
+ "version": 3
567
+ },
568
+ "file_extension": ".py",
569
+ "mimetype": "text/x-python",
570
+ "name": "python",
571
+ "nbconvert_exporter": "python",
572
+ "pygments_lexer": "ipython3",
573
+ "version": "3.12.2"
574
+ }
575
+ },
576
+ "nbformat": 4,
577
+ "nbformat_minor": 5
578
+ }
notebooks/plot_dedup_ind_dedup_better.ipynb ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 19,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-04-30T15:08:02.398435Z",
10
+ "start_time": "2024-04-30T15:08:02.194901Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
63
+ " <td>6</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330893</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.209</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2195</td>\n",
80
+ " <td>0.2510</td>\n",
81
+ " <td>0.230294</td>\n",
82
+ " <td>0.250147</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
87
+ " <td>6</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.360520</td>\n",
90
+ " <td>0.254</td>\n",
91
+ " <td>0.260</td>\n",
92
+ " <td>0.290</td>\n",
93
+ " <td>0.281</td>\n",
94
+ " <td>0.138</td>\n",
95
+ " <td>0.256</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.362</td>\n",
98
+ " <td>0.400</td>\n",
99
+ " <td>0.517</td>\n",
100
+ " <td>0.524</td>\n",
101
+ " <td>0.573</td>\n",
102
+ " <td>0.515</td>\n",
103
+ " <td>0.2675</td>\n",
104
+ " <td>0.2895</td>\n",
105
+ " <td>0.239489</td>\n",
106
+ " <td>0.251660</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
111
+ " <td>6</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.373315</td>\n",
114
+ " <td>0.285</td>\n",
115
+ " <td>0.278</td>\n",
116
+ " <td>0.315</td>\n",
117
+ " <td>0.323</td>\n",
118
+ " <td>0.138</td>\n",
119
+ " <td>0.272</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.365</td>\n",
122
+ " <td>0.395</td>\n",
123
+ " <td>0.509</td>\n",
124
+ " <td>0.490</td>\n",
125
+ " <td>0.677</td>\n",
126
+ " <td>0.596</td>\n",
127
+ " <td>0.3075</td>\n",
128
+ " <td>0.3235</td>\n",
129
+ " <td>0.250318</td>\n",
130
+ " <td>0.261019</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
135
+ " <td>6</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.388201</td>\n",
138
+ " <td>0.294</td>\n",
139
+ " <td>0.291</td>\n",
140
+ " <td>0.327</td>\n",
141
+ " <td>0.341</td>\n",
142
+ " <td>0.152</td>\n",
143
+ " <td>0.298</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.371</td>\n",
146
+ " <td>0.396</td>\n",
147
+ " <td>0.512</td>\n",
148
+ " <td>0.504</td>\n",
149
+ " <td>0.712</td>\n",
150
+ " <td>0.621</td>\n",
151
+ " <td>0.3220</td>\n",
152
+ " <td>0.3390</td>\n",
153
+ " <td>0.255646</td>\n",
154
+ " <td>0.266605</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>big-run-sampled_full_filtered_no_dedup</td>\n",
159
+ " <td>6</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.393412</td>\n",
162
+ " <td>0.306</td>\n",
163
+ " <td>0.307</td>\n",
164
+ " <td>0.337</td>\n",
165
+ " <td>0.360</td>\n",
166
+ " <td>0.172</td>\n",
167
+ " <td>0.284</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.380</td>\n",
170
+ " <td>0.402</td>\n",
171
+ " <td>0.522</td>\n",
172
+ " <td>0.510</td>\n",
173
+ " <td>0.729</td>\n",
174
+ " <td>0.612</td>\n",
175
+ " <td>0.3100</td>\n",
176
+ " <td>0.3385</td>\n",
177
+ " <td>0.253048</td>\n",
178
+ " <td>0.266798</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>670</th>\n",
206
+ " <td>big-run-sampled_full_ind_minhash</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>163000</td>\n",
209
+ " <td>0.481842</td>\n",
210
+ " <td>0.427</td>\n",
211
+ " <td>0.393</td>\n",
212
+ " <td>0.488</td>\n",
213
+ " <td>0.579</td>\n",
214
+ " <td>0.242</td>\n",
215
+ " <td>0.358</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.420</td>\n",
218
+ " <td>0.397</td>\n",
219
+ " <td>0.587</td>\n",
220
+ " <td>0.568</td>\n",
221
+ " <td>0.885</td>\n",
222
+ " <td>0.809</td>\n",
223
+ " <td>0.4760</td>\n",
224
+ " <td>0.4595</td>\n",
225
+ " <td>0.305843</td>\n",
226
+ " <td>0.330238</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>671</th>\n",
230
+ " <td>big-run-sampled_full_ind_minhash</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>164000</td>\n",
233
+ " <td>0.482727</td>\n",
234
+ " <td>0.426</td>\n",
235
+ " <td>0.394</td>\n",
236
+ " <td>0.487</td>\n",
237
+ " <td>0.582</td>\n",
238
+ " <td>0.238</td>\n",
239
+ " <td>0.360</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.422</td>\n",
242
+ " <td>0.398</td>\n",
243
+ " <td>0.575</td>\n",
244
+ " <td>0.562</td>\n",
245
+ " <td>0.885</td>\n",
246
+ " <td>0.827</td>\n",
247
+ " <td>0.4745</td>\n",
248
+ " <td>0.4625</td>\n",
249
+ " <td>0.307377</td>\n",
250
+ " <td>0.332317</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>672</th>\n",
254
+ " <td>big-run-sampled_full_ind_minhash</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>165000</td>\n",
257
+ " <td>0.482413</td>\n",
258
+ " <td>0.423</td>\n",
259
+ " <td>0.397</td>\n",
260
+ " <td>0.482</td>\n",
261
+ " <td>0.573</td>\n",
262
+ " <td>0.238</td>\n",
263
+ " <td>0.360</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.409</td>\n",
266
+ " <td>0.396</td>\n",
267
+ " <td>0.581</td>\n",
268
+ " <td>0.569</td>\n",
269
+ " <td>0.889</td>\n",
270
+ " <td>0.829</td>\n",
271
+ " <td>0.4675</td>\n",
272
+ " <td>0.4600</td>\n",
273
+ " <td>0.308059</td>\n",
274
+ " <td>0.331304</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>673</th>\n",
278
+ " <td>big-run-sampled_full_ind_minhash</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>166000</td>\n",
281
+ " <td>0.482014</td>\n",
282
+ " <td>0.422</td>\n",
283
+ " <td>0.391</td>\n",
284
+ " <td>0.477</td>\n",
285
+ " <td>0.573</td>\n",
286
+ " <td>0.230</td>\n",
287
+ " <td>0.358</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.420</td>\n",
290
+ " <td>0.400</td>\n",
291
+ " <td>0.586</td>\n",
292
+ " <td>0.566</td>\n",
293
+ " <td>0.883</td>\n",
294
+ " <td>0.817</td>\n",
295
+ " <td>0.4660</td>\n",
296
+ " <td>0.4645</td>\n",
297
+ " <td>0.304975</td>\n",
298
+ " <td>0.329611</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>674</th>\n",
302
+ " <td>big-run-sampled_full_ind_minhash</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>167000</td>\n",
305
+ " <td>0.486587</td>\n",
306
+ " <td>0.424</td>\n",
307
+ " <td>0.402</td>\n",
308
+ " <td>0.490</td>\n",
309
+ " <td>0.579</td>\n",
310
+ " <td>0.236</td>\n",
311
+ " <td>0.360</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.417</td>\n",
314
+ " <td>0.405</td>\n",
315
+ " <td>0.585</td>\n",
316
+ " <td>0.575</td>\n",
317
+ " <td>0.884</td>\n",
318
+ " <td>0.832</td>\n",
319
+ " <td>0.4760</td>\n",
320
+ " <td>0.4715</td>\n",
321
+ " <td>0.309503</td>\n",
322
+ " <td>0.332197</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>675 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 big-run-sampled_full_filtered_no_dedup 6 0 0.330893 \n",
332
+ "1 big-run-sampled_full_filtered_no_dedup 6 1000 0.360520 \n",
333
+ "2 big-run-sampled_full_filtered_no_dedup 6 2000 0.373315 \n",
334
+ "3 big-run-sampled_full_filtered_no_dedup 6 3000 0.388201 \n",
335
+ "4 big-run-sampled_full_filtered_no_dedup 6 4000 0.393412 \n",
336
+ ".. ... ... ... ... \n",
337
+ "670 big-run-sampled_full_ind_minhash 6 163000 0.481842 \n",
338
+ "671 big-run-sampled_full_ind_minhash 6 164000 0.482727 \n",
339
+ "672 big-run-sampled_full_ind_minhash 6 165000 0.482413 \n",
340
+ "673 big-run-sampled_full_ind_minhash 6 166000 0.482014 \n",
341
+ "674 big-run-sampled_full_ind_minhash 6 167000 0.486587 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.254 0.260 0.290 \n",
346
+ "2 0.285 0.278 0.315 \n",
347
+ "3 0.294 0.291 0.327 \n",
348
+ "4 0.306 0.307 0.337 \n",
349
+ ".. ... ... ... \n",
350
+ "670 0.427 0.393 0.488 \n",
351
+ "671 0.426 0.394 0.487 \n",
352
+ "672 0.423 0.397 0.482 \n",
353
+ "673 0.422 0.391 0.477 \n",
354
+ "674 0.424 0.402 0.490 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.281 0.138 0.256 ... 0.362 \n",
359
+ "2 0.323 0.138 0.272 ... 0.365 \n",
360
+ "3 0.341 0.152 0.298 ... 0.371 \n",
361
+ "4 0.360 0.172 0.284 ... 0.380 \n",
362
+ ".. ... ... ... ... ... \n",
363
+ "670 0.579 0.242 0.358 ... 0.420 \n",
364
+ "671 0.582 0.238 0.360 ... 0.422 \n",
365
+ "672 0.573 0.238 0.360 ... 0.409 \n",
366
+ "673 0.573 0.230 0.358 ... 0.420 \n",
367
+ "674 0.579 0.236 0.360 ... 0.417 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.209 \n",
371
+ "1 0.400 0.517 0.524 0.573 \n",
372
+ "2 0.395 0.509 0.490 0.677 \n",
373
+ "3 0.396 0.512 0.504 0.712 \n",
374
+ "4 0.402 0.522 0.510 0.729 \n",
375
+ ".. ... ... ... ... \n",
376
+ "670 0.397 0.587 0.568 0.885 \n",
377
+ "671 0.398 0.575 0.562 0.885 \n",
378
+ "672 0.396 0.581 0.569 0.889 \n",
379
+ "673 0.400 0.586 0.566 0.883 \n",
380
+ "674 0.405 0.585 0.575 0.884 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
384
+ "1 0.515 0.2675 0.2895 0.239489 0.251660 \n",
385
+ "2 0.596 0.3075 0.3235 0.250318 0.261019 \n",
386
+ "3 0.621 0.3220 0.3390 0.255646 0.266605 \n",
387
+ "4 0.612 0.3100 0.3385 0.253048 0.266798 \n",
388
+ ".. ... ... ... ... ... \n",
389
+ "670 0.809 0.4760 0.4595 0.305843 0.330238 \n",
390
+ "671 0.827 0.4745 0.4625 0.307377 0.332317 \n",
391
+ "672 0.829 0.4675 0.4600 0.308059 0.331304 \n",
392
+ "673 0.817 0.4660 0.4645 0.304975 0.329611 \n",
393
+ "674 0.832 0.4760 0.4715 0.309503 0.332197 \n",
394
+ "\n",
395
+ "[675 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 19,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/cross_ind_unfiltered_comparison.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 20,
414
+ "id": "b610f43caefdf01",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-04-30T15:08:02.401852Z",
418
+ "start_time": "2024-04-30T15:08:02.399712Z"
419
+ },
420
+ "collapsed": false
421
+ },
422
+ "outputs": [],
423
+ "source": [
424
+ "runs_mapping = {\n",
425
+ " \"big-run-refinedweb\": \"RefinedWeb\",\n",
426
+ " \"big-run-fineweb-cross-dedup-fixed\": \"FineWeb full MinHash\",\n",
427
+ " \"big-run-sampled_full_ind_minhash\": \"FineWeb independent MinHash\",\n",
428
+ " \"big-run-sampled_full_filtered_no_dedup\": \"FineWeb filtered only\"\n",
429
+ "}"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": 21,
435
+ "id": "initial_id",
436
+ "metadata": {
437
+ "ExecuteTime": {
438
+ "end_time": "2024-04-30T15:08:02.519228Z",
439
+ "start_time": "2024-04-30T15:08:02.402938Z"
440
+ },
441
+ "collapsed": true
442
+ },
443
+ "outputs": [
444
+ {
445
+ "name": "stderr",
446
+ "output_type": "stream",
447
+ "text": [
448
+ "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
449
+ ]
450
+ },
451
+ {
452
+ "data": {
453
+ "image/png": "",
454
+ "text/plain": [
455
+ "<Figure size 640x480 with 1 Axes>"
456
+ ]
457
+ },
458
+ "metadata": {},
459
+ "output_type": "display_data"
460
+ }
461
+ ],
462
+ "source": [
463
+ "from matplotlib import pyplot as plt\n",
464
+ "\n",
465
+ "import json\n",
466
+ "import os\n",
467
+ "from matplotlib import pyplot as plt\n",
468
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
469
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
470
+ "\n",
471
+ "def normalize_runname(runname):\n",
472
+ " return runname.replace(\"/\", \"_\")\n",
473
+ "\n",
474
+ "grouped = (\n",
475
+ " df.groupby([\"runname\", \"steps\"])\n",
476
+ " .agg(\n",
477
+ " {\n",
478
+ " key: \"mean\" for key in metrics\n",
479
+ " }\n",
480
+ " )\n",
481
+ " .reset_index()\n",
482
+ ")\n",
483
+ "\n",
484
+ "file_id=\"../assets/data/plots/ind_dedup_better\"\n",
485
+ "files = {}\n",
486
+ "for metric in metrics:\n",
487
+ " datas = {}\n",
488
+ " for name, group in grouped.groupby(\"runname\"):\n",
489
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
490
+ " group = group.set_index(\"steps\")\n",
491
+ " rolling_avg = group\n",
492
+ " # rolling_avg = group.rolling(window=5).mean()\n",
493
+ " datas[name] = {\n",
494
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
495
+ " \"y\": rolling_avg[metric].tolist(),\n",
496
+ " \"label\": runs_mapping[name],\n",
497
+ " }\n",
498
+ " # Sort the runs by their final metric value, highest first\n",
499
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
500
+ " # Create a folder\n",
501
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
502
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
503
+ " json.dump({\n",
504
+ " \"data\": datas,\n",
505
+ " \"layout\": {\n",
506
+ " \"title\": {\n",
507
+ " \"text\": \"Independent dedup outperforms dedup across dumps\"\n",
508
+ " },\n",
509
+ " }\n",
510
+ " }, f)\n",
511
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
512
+ "# Create index\n",
513
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
514
+ " json.dump({\n",
515
+ " \"files\": files,\n",
516
+ " \"settings\": {\n",
517
+ " \"defaultMetric\": \"agg_score\",\n",
518
+ " \"slider\":{\"min\":0,\"max\":30,\"default\":5}\n",
519
+ " }\n",
520
+ " }, f)\n",
521
+ " \n",
522
+ "\n",
523
+ " \n",
524
+ "# Add labels and legend\n",
525
+ "plt.xlabel('Training tokens (billions)')\n",
526
+ "plt.ylabel('Agg Score')\n",
527
+ "plt.title('Independent dedup outperforms dedup across dumps')\n",
528
+ "plt.legend()\n",
529
+ "\n",
530
+ "# Show the plot\n",
531
+ "plt.show()"
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": 4,
537
+ "id": "af28ebbd054cdc33",
538
+ "metadata": {
539
+ "ExecuteTime": {
540
+ "end_time": "2024-04-30T15:08:02.522543Z",
541
+ "start_time": "2024-04-30T15:08:02.520569Z"
542
+ },
543
+ "collapsed": false
544
+ },
545
+ "outputs": [],
546
+ "source": []
547
+ }
548
+ ],
549
+ "metadata": {
550
+ "kernelspec": {
551
+ "display_name": "Python 3",
552
+ "language": "python",
553
+ "name": "python3"
554
+ },
555
+ "language_info": {
556
+ "codemirror_mode": {
557
+ "name": "ipython",
558
+ "version": 3
559
+ },
560
+ "file_extension": ".py",
561
+ "mimetype": "text/x-python",
562
+ "name": "python",
563
+ "nbconvert_exporter": "python",
564
+ "pygments_lexer": "ipython3",
565
+ "version": "3.12.2"
566
+ }
567
+ },
568
+ "nbformat": 4,
569
+ "nbformat_minor": 5
570
+ }
notebooks/plot_dedup_simul.ipynb ADDED
@@ -0,0 +1,1420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 32,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import json\n",
10
+ "\n",
11
+ "\n",
12
+ "def normalize_run_name(run_name):\n",
13
+ " return run_name.replace(\"/\", \"_\")\n",
14
+ "\n",
15
+ "def save_for_plot(dir_name, df, run_names, xlabel=\"Dataset\", ylabel=\"Matched as dups probability\", plot_name=\"plot name\", custom_layout={}, ranges={}, x_column=None, default_metric=None):\n",
16
+ " import os\n",
17
+ " files = {}\n",
18
+ " os.makedirs(f\"data/plots/{dir_name}\", exist_ok=True)\n",
19
+ " data = {}\n",
20
+ " for run_name in run_names:\n",
21
+ " data[run_name] = {\n",
22
+ " \"x\": df[x_column].tolist() if x_column else [run_name],\n",
23
+ " \"y\": df[run_name].tolist(),\n",
24
+ " \"label\": run_name,\n",
25
+ " }\n",
26
+ " file_name = f\"default.json\"\n",
27
+ " files[\"default\"] = {\"file\": f\"{file_name}\"}\n",
28
+ " with open(f\"data/plots/{dir_name}/{file_name}\", \"w\") as f:\n",
29
+ " json.dump({\n",
30
+ " \"data\": data,\n",
31
+ " \"layout\": {\n",
32
+ " \"title\": {\n",
33
+ " \"text\": plot_name,\n",
34
+ " },\n",
35
+ " \"xaxis\": {\n",
36
+ " \"title\": {\n",
37
+ " \"text\": xlabel,\n",
38
+ " },\n",
39
+ " },\n",
40
+ " \"yaxis\": {\n",
41
+ " # \"range\": ranges.get(view, None),\n",
42
+ " \"title\": {\n",
43
+ " \"text\": ylabel,\n",
44
+ " },\n",
45
+ " },\n",
46
+ " **custom_layout,\n",
47
+ " }\n",
48
+ " }, f)\n",
49
+ " with open(f\"data/plots/{dir_name}/index.json\", \"w\") as f:\n",
50
+ " json.dump({\n",
51
+ " \"files\": files,\n",
52
+ " \"settings\": {\n",
53
+ " \"defaultMetric\": default_metric,\n",
54
+ " \"slider\": None,\n",
55
+ " \"autoSetXRange\": False,\n",
56
+ " \"type\": \"bar\"\n",
57
+ " }\n",
58
+ " }, f)\n",
59
+ " return files\n",
60
+ "\n"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 9,
66
+ "metadata": {},
67
+ "outputs": [
68
+ {
69
+ "data": {
70
+ "text/html": [
71
+ "<div>\n",
72
+ "<style scoped>\n",
73
+ " .dataframe tbody tr th:only-of-type {\n",
74
+ " vertical-align: middle;\n",
75
+ " }\n",
76
+ "\n",
77
+ " .dataframe tbody tr th {\n",
78
+ " vertical-align: top;\n",
79
+ " }\n",
80
+ "\n",
81
+ " .dataframe thead th {\n",
82
+ " text-align: right;\n",
83
+ " }\n",
84
+ "</style>\n",
85
+ "<table border=\"1\" class=\"dataframe\">\n",
86
+ " <thead>\n",
87
+ " <tr style=\"text-align: right;\">\n",
88
+ " <th></th>\n",
89
+ " <th>1</th>\n",
90
+ " <th>2</th>\n",
91
+ " <th>3</th>\n",
92
+ " <th>4-8</th>\n",
93
+ " <th>8-16</th>\n",
94
+ " <th>16-32</th>\n",
95
+ " </tr>\n",
96
+ " </thead>\n",
97
+ " <tbody>\n",
98
+ " <tr>\n",
99
+ " <th>1B</th>\n",
100
+ " <td>0.994974</td>\n",
101
+ " <td>0.005008</td>\n",
102
+ " <td>0.000018</td>\n",
103
+ " <td>0.0</td>\n",
104
+ " <td>0.0</td>\n",
105
+ " <td>0.0</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>10B</th>\n",
109
+ " <td>0.951508</td>\n",
110
+ " <td>0.047331</td>\n",
111
+ " <td>0.001144</td>\n",
112
+ " <td>0.000017</td>\n",
113
+ " <td>0.0</td>\n",
114
+ " <td>0.0</td>\n",
115
+ " </tr>\n",
116
+ " <tr>\n",
117
+ " <th>100B</th>\n",
118
+ " <td>0.608873</td>\n",
119
+ " <td>0.302822</td>\n",
120
+ " <td>0.074548</td>\n",
121
+ " <td>0.013757</td>\n",
122
+ " <td>0.0</td>\n",
123
+ " <td>0.0</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>350B</th>\n",
127
+ " <td>0.174147</td>\n",
128
+ " <td>0.30712</td>\n",
129
+ " <td>0.268018</td>\n",
130
+ " <td>0.250649</td>\n",
131
+ " <td>0.000065</td>\n",
132
+ " <td>0.0</td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>1T</th>\n",
136
+ " <td>0.006232</td>\n",
137
+ " <td>0.03247</td>\n",
138
+ " <td>0.083743</td>\n",
139
+ " <td>0.817636</td>\n",
140
+ " <td>0.05991</td>\n",
141
+ " <td>0.000008</td>\n",
142
+ " </tr>\n",
143
+ " </tbody>\n",
144
+ "</table>\n",
145
+ "</div>"
146
+ ],
147
+ "text/plain": [
148
+ " 1 2 3 4-8 8-16 16-32\n",
149
+ "1B 0.994974 0.005008 0.000018 0.0 0.0 0.0\n",
150
+ "10B 0.951508 0.047331 0.001144 0.000017 0.0 0.0\n",
151
+ "100B 0.608873 0.302822 0.074548 0.013757 0.0 0.0\n",
152
+ "350B 0.174147 0.30712 0.268018 0.250649 0.000065 0.0\n",
153
+ "1T 0.006232 0.03247 0.083743 0.817636 0.05991 0.000008"
154
+ ]
155
+ },
156
+ "execution_count": 9,
157
+ "metadata": {},
158
+ "output_type": "execute_result"
159
+ }
160
+ ],
161
+ "source": []
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 28,
166
+ "metadata": {},
167
+ "outputs": [
168
+ {
169
+ "data": {
170
+ "text/html": [
171
+ "<div>\n",
172
+ "<style scoped>\n",
173
+ " .dataframe tbody tr th:only-of-type {\n",
174
+ " vertical-align: middle;\n",
175
+ " }\n",
176
+ "\n",
177
+ " .dataframe tbody tr th {\n",
178
+ " vertical-align: top;\n",
179
+ " }\n",
180
+ "\n",
181
+ " .dataframe thead th {\n",
182
+ " text-align: right;\n",
183
+ " }\n",
184
+ "</style>\n",
185
+ "<table border=\"1\" class=\"dataframe\">\n",
186
+ " <thead>\n",
187
+ " <tr style=\"text-align: right;\">\n",
188
+ " <th></th>\n",
189
+ " <th>index</th>\n",
190
+ " <th>1</th>\n",
191
+ " <th>2</th>\n",
192
+ " <th>3</th>\n",
193
+ " <th>4-8</th>\n",
194
+ " <th>8-16</th>\n",
195
+ " <th>16-32</th>\n",
196
+ " </tr>\n",
197
+ " </thead>\n",
198
+ " <tbody>\n",
199
+ " <tr>\n",
200
+ " <th>0</th>\n",
201
+ " <td>1B</td>\n",
202
+ " <td>0.994974</td>\n",
203
+ " <td>0.005008</td>\n",
204
+ " <td>0.000018</td>\n",
205
+ " <td>0.0</td>\n",
206
+ " <td>0.0</td>\n",
207
+ " <td>0.0</td>\n",
208
+ " </tr>\n",
209
+ " <tr>\n",
210
+ " <th>1</th>\n",
211
+ " <td>10B</td>\n",
212
+ " <td>0.951508</td>\n",
213
+ " <td>0.047331</td>\n",
214
+ " <td>0.001144</td>\n",
215
+ " <td>0.000017</td>\n",
216
+ " <td>0.0</td>\n",
217
+ " <td>0.0</td>\n",
218
+ " </tr>\n",
219
+ " <tr>\n",
220
+ " <th>2</th>\n",
221
+ " <td>100B</td>\n",
222
+ " <td>0.608873</td>\n",
223
+ " <td>0.302822</td>\n",
224
+ " <td>0.074548</td>\n",
225
+ " <td>0.013757</td>\n",
226
+ " <td>0.0</td>\n",
227
+ " <td>0.0</td>\n",
228
+ " </tr>\n",
229
+ " <tr>\n",
230
+ " <th>3</th>\n",
231
+ " <td>350B</td>\n",
232
+ " <td>0.174147</td>\n",
233
+ " <td>0.30712</td>\n",
234
+ " <td>0.268018</td>\n",
235
+ " <td>0.250649</td>\n",
236
+ " <td>0.000065</td>\n",
237
+ " <td>0.0</td>\n",
238
+ " </tr>\n",
239
+ " <tr>\n",
240
+ " <th>4</th>\n",
241
+ " <td>1T</td>\n",
242
+ " <td>0.006232</td>\n",
243
+ " <td>0.03247</td>\n",
244
+ " <td>0.083743</td>\n",
245
+ " <td>0.817636</td>\n",
246
+ " <td>0.05991</td>\n",
247
+ " <td>0.000008</td>\n",
248
+ " </tr>\n",
249
+ " </tbody>\n",
250
+ "</table>\n",
251
+ "</div>"
252
+ ],
253
+ "text/plain": [
254
+ " index 1 2 3 4-8 8-16 16-32\n",
255
+ "0 1B 0.994974 0.005008 0.000018 0.0 0.0 0.0\n",
256
+ "1 10B 0.951508 0.047331 0.001144 0.000017 0.0 0.0\n",
257
+ "2 100B 0.608873 0.302822 0.074548 0.013757 0.0 0.0\n",
258
+ "3 350B 0.174147 0.30712 0.268018 0.250649 0.000065 0.0\n",
259
+ "4 1T 0.006232 0.03247 0.083743 0.817636 0.05991 0.000008"
260
+ ]
261
+ },
262
+ "execution_count": 28,
263
+ "metadata": {},
264
+ "output_type": "execute_result"
265
+ }
266
+ ],
267
+ "source": [
268
+ "summarized_df.reset_index()"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 57,
274
+ "metadata": {},
275
+ "outputs": [
276
+ {
277
+ "data": {
278
+ "text/plain": [
279
+ "{'default': {'file': 'default.json'}}"
280
+ ]
281
+ },
282
+ "execution_count": 57,
283
+ "metadata": {},
284
+ "output_type": "execute_result"
285
+ }
286
+ ],
287
+ "source": [
288
+ "import pandas as pd\n",
289
+ "\n",
290
+ "\n",
291
+ "df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n",
292
+ "\n",
293
+ "\n",
294
+ "def summarize_ranges(df):\n",
295
+ " df_summarized = pd.DataFrame(\n",
296
+ " index=[\"1\", \"2\", \"3\", \"4-8\", \"8-16\", \"16-32\"], columns=df.columns\n",
297
+ " )\n",
298
+ " df_summarized.loc[\"1\"] = df.iloc[0]\n",
299
+ " df_summarized.loc[\"2\"] = df.iloc[1]\n",
300
+ " df_summarized.loc[\"3\"] = df.iloc[2]\n",
301
+ " df_summarized.loc[\"4-8\"] = df.iloc[3:9].sum()\n",
302
+ " df_summarized.loc[\"8-16\"] = df.iloc[9:17].sum()\n",
303
+ " df_summarized.loc[\"16-32\"] = df.iloc[17:].sum()\n",
304
+ " return df_summarized\n",
305
+ "\n",
306
+ "\n",
307
+ "summarized_df = summarize_ranges(df).T\n",
308
+ "cols = summarized_df.columns\n",
309
+ "summarized_df.reset_index(inplace=True)\n",
310
+ "save_for_plot(\n",
311
+ " \"duplicates-simul\",\n",
312
+ " summarized_df,\n",
313
+ " cols,\n",
314
+ " x_column=\"index\",\n",
315
+ " plot_name=\"Sampling from 1000 identical buckets with 200B tokens each\",\n",
316
+ " ylabel=\"Dataset fraction\",\n",
317
+ " xlabel=\"Sample size\",\n",
318
+ " default_metric=\"default\",\n",
319
+ " custom_layout={\n",
320
+ " \"barmode\": \"stack\",\n",
321
+ " \"legend\": {\n",
322
+ " \"title\": {\n",
323
+ " \"text\": \"# duplicates\",\n",
324
+ " \"font\": {\n",
325
+ " \"size\": 14,\n",
326
+ " \"weight\": \"bold\",\n",
327
+ " }\n",
328
+ " },\n",
329
+ " \"font\": {\n",
330
+ " \"size\": 14,\n",
331
+ " },\n",
332
+ " \"bgcolor\": 'rgba(255, 255, 255, 0.9)',\n",
333
+ " # \"borderwidth\": 1,\n",
334
+ " \"orientation\": \"v\",\n",
335
+ " \"xanchor\": \"left\",\n",
336
+ " \"yanchor\": \"bottom\",\n",
337
+ " \"x\": 0.01,\n",
338
+ " \"y\": 0,\n",
339
+ " },\n",
340
+ " },\n",
341
+ ")"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 17,
347
+ "metadata": {},
348
+ "outputs": [
349
+ {
350
+ "ename": "KeyError",
351
+ "evalue": "'index'",
352
+ "output_type": "error",
353
+ "traceback": [
354
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
355
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
356
+ "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
357
+ "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
358
+ "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
359
+ "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
360
+ "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
361
+ "\u001b[0;31mKeyError\u001b[0m: 'index'",
362
+ "\nThe above exception was the direct cause of the following exception:\n",
363
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
364
+ "Cell \u001b[0;32mIn[17], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Take the sumarized_df and pivot it\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mindex\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnum_duplicates\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mduplicates_prob\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
365
+ "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:9326\u001b[0m, in \u001b[0;36mDataFrame.pivot\u001b[0;34m(self, columns, index, values)\u001b[0m\n\u001b[1;32m 9319\u001b[0m \u001b[38;5;129m@Substitution\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9320\u001b[0m \u001b[38;5;129m@Appender\u001b[39m(_shared_docs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpivot\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 9321\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpivot\u001b[39m(\n\u001b[1;32m 9322\u001b[0m \u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m, columns, index\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default, values\u001b[38;5;241m=\u001b[39mlib\u001b[38;5;241m.\u001b[39mno_default\n\u001b[1;32m 9323\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 9324\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreshape\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpivot\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pivot\n\u001b[0;32m-> 9326\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpivot\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\n",
366
+ "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/reshape/pivot.py:553\u001b[0m, in \u001b[0;36mpivot\u001b[0;34m(data, columns, index, values)\u001b[0m\n\u001b[1;32m 549\u001b[0m index_list \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 550\u001b[0m data\u001b[38;5;241m.\u001b[39m_constructor_sliced(data\u001b[38;5;241m.\u001b[39mindex, name\u001b[38;5;241m=\u001b[39mdata\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 551\u001b[0m ]\n\u001b[1;32m 552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 553\u001b[0m index_list \u001b[38;5;241m=\u001b[39m [\u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[43midx\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m idx \u001b[38;5;129;01min\u001b[39;00m com\u001b[38;5;241m.\u001b[39mconvert_to_list_like(index)]\n\u001b[1;32m 555\u001b[0m data_columns \u001b[38;5;241m=\u001b[39m [data[col] \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns_listlike]\n\u001b[1;32m 556\u001b[0m index_list\u001b[38;5;241m.\u001b[39mextend(data_columns)\n",
367
+ "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
368
+ "File \u001b[0;32m~/.pyenv/versions/3.12.2/envs/datatrove/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
369
+ "\u001b[0;31mKeyError\u001b[0m: 'index'"
370
+ ]
371
+ }
372
+ ],
373
+ "source": [
374
+ "# Take the summarized_df and pivot it"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": 5,
380
+ "metadata": {},
381
+ "outputs": [
382
+ {
383
+ "data": {
384
+ "application/vnd.plotly.v1+json": {
385
+ "config": {
386
+ "plotlyServerURL": "https://plot.ly"
387
+ },
388
+ "data": [
389
+ {
390
+ "name": "1",
391
+ "type": "bar",
392
+ "x": [
393
+ "1B",
394
+ "10B",
395
+ "100B",
396
+ "350B",
397
+ "1T"
398
+ ],
399
+ "y": [
400
+ 0.994974,
401
+ 0.9515081,
402
+ 0.60887281,
403
+ 0.1741474885714285,
404
+ 0.006232416
405
+ ]
406
+ },
407
+ {
408
+ "name": "2",
409
+ "type": "bar",
410
+ "x": [
411
+ "1B",
412
+ "10B",
413
+ "100B",
414
+ "350B",
415
+ "1T"
416
+ ],
417
+ "y": [
418
+ 0.005008,
419
+ 0.047331,
420
+ 0.30282154,
421
+ 0.3071204342857143,
422
+ 0.032470074
423
+ ]
424
+ },
425
+ {
426
+ "name": "3",
427
+ "type": "bar",
428
+ "x": [
429
+ "1B",
430
+ "10B",
431
+ "100B",
432
+ "350B",
433
+ "1T"
434
+ ],
435
+ "y": [
436
+ 0.000018,
437
+ 0.0011439,
438
+ 0.0745482,
439
+ 0.2680183371428571,
440
+ 0.083742993
441
+ ]
442
+ },
443
+ {
444
+ "name": "4-8",
445
+ "type": "bar",
446
+ "x": [
447
+ "1B",
448
+ "10B",
449
+ "100B",
450
+ "350B",
451
+ "1T"
452
+ ],
453
+ "y": [
454
+ 0,
455
+ 0.000017,
456
+ 0.01375745,
457
+ 0.25064894285714273,
458
+ 0.8176358810000001
459
+ ]
460
+ },
461
+ {
462
+ "name": "8-16",
463
+ "type": "bar",
464
+ "x": [
465
+ "1B",
466
+ "10B",
467
+ "100B",
468
+ "350B",
469
+ "1T"
470
+ ],
471
+ "y": [
472
+ 0,
473
+ 0,
474
+ 0,
475
+ 0.00006479714285714286,
476
+ 0.05991048400000001
477
+ ]
478
+ },
479
+ {
480
+ "name": "16-32",
481
+ "type": "bar",
482
+ "x": [
483
+ "1B",
484
+ "10B",
485
+ "100B",
486
+ "350B",
487
+ "1T"
488
+ ],
489
+ "y": [
490
+ 0,
491
+ 0,
492
+ 0,
493
+ 0,
494
+ 0.000008152000000000001
495
+ ]
496
+ }
497
+ ],
498
+ "layout": {
499
+ "barmode": "stack",
500
+ "legend": {
501
+ "title": {
502
+ "text": "# duplicates"
503
+ }
504
+ },
505
+ "template": {
506
+ "data": {
507
+ "bar": [
508
+ {
509
+ "error_x": {
510
+ "color": "#2a3f5f"
511
+ },
512
+ "error_y": {
513
+ "color": "#2a3f5f"
514
+ },
515
+ "marker": {
516
+ "line": {
517
+ "color": "#E5ECF6",
518
+ "width": 0.5
519
+ },
520
+ "pattern": {
521
+ "fillmode": "overlay",
522
+ "size": 10,
523
+ "solidity": 0.2
524
+ }
525
+ },
526
+ "type": "bar"
527
+ }
528
+ ],
529
+ "barpolar": [
530
+ {
531
+ "marker": {
532
+ "line": {
533
+ "color": "#E5ECF6",
534
+ "width": 0.5
535
+ },
536
+ "pattern": {
537
+ "fillmode": "overlay",
538
+ "size": 10,
539
+ "solidity": 0.2
540
+ }
541
+ },
542
+ "type": "barpolar"
543
+ }
544
+ ],
545
+ "carpet": [
546
+ {
547
+ "aaxis": {
548
+ "endlinecolor": "#2a3f5f",
549
+ "gridcolor": "white",
550
+ "linecolor": "white",
551
+ "minorgridcolor": "white",
552
+ "startlinecolor": "#2a3f5f"
553
+ },
554
+ "baxis": {
555
+ "endlinecolor": "#2a3f5f",
556
+ "gridcolor": "white",
557
+ "linecolor": "white",
558
+ "minorgridcolor": "white",
559
+ "startlinecolor": "#2a3f5f"
560
+ },
561
+ "type": "carpet"
562
+ }
563
+ ],
564
+ "choropleth": [
565
+ {
566
+ "colorbar": {
567
+ "outlinewidth": 0,
568
+ "ticks": ""
569
+ },
570
+ "type": "choropleth"
571
+ }
572
+ ],
573
+ "contour": [
574
+ {
575
+ "colorbar": {
576
+ "outlinewidth": 0,
577
+ "ticks": ""
578
+ },
579
+ "colorscale": [
580
+ [
581
+ 0,
582
+ "#0d0887"
583
+ ],
584
+ [
585
+ 0.1111111111111111,
586
+ "#46039f"
587
+ ],
588
+ [
589
+ 0.2222222222222222,
590
+ "#7201a8"
591
+ ],
592
+ [
593
+ 0.3333333333333333,
594
+ "#9c179e"
595
+ ],
596
+ [
597
+ 0.4444444444444444,
598
+ "#bd3786"
599
+ ],
600
+ [
601
+ 0.5555555555555556,
602
+ "#d8576b"
603
+ ],
604
+ [
605
+ 0.6666666666666666,
606
+ "#ed7953"
607
+ ],
608
+ [
609
+ 0.7777777777777778,
610
+ "#fb9f3a"
611
+ ],
612
+ [
613
+ 0.8888888888888888,
614
+ "#fdca26"
615
+ ],
616
+ [
617
+ 1,
618
+ "#f0f921"
619
+ ]
620
+ ],
621
+ "type": "contour"
622
+ }
623
+ ],
624
+ "contourcarpet": [
625
+ {
626
+ "colorbar": {
627
+ "outlinewidth": 0,
628
+ "ticks": ""
629
+ },
630
+ "type": "contourcarpet"
631
+ }
632
+ ],
633
+ "heatmap": [
634
+ {
635
+ "colorbar": {
636
+ "outlinewidth": 0,
637
+ "ticks": ""
638
+ },
639
+ "colorscale": [
640
+ [
641
+ 0,
642
+ "#0d0887"
643
+ ],
644
+ [
645
+ 0.1111111111111111,
646
+ "#46039f"
647
+ ],
648
+ [
649
+ 0.2222222222222222,
650
+ "#7201a8"
651
+ ],
652
+ [
653
+ 0.3333333333333333,
654
+ "#9c179e"
655
+ ],
656
+ [
657
+ 0.4444444444444444,
658
+ "#bd3786"
659
+ ],
660
+ [
661
+ 0.5555555555555556,
662
+ "#d8576b"
663
+ ],
664
+ [
665
+ 0.6666666666666666,
666
+ "#ed7953"
667
+ ],
668
+ [
669
+ 0.7777777777777778,
670
+ "#fb9f3a"
671
+ ],
672
+ [
673
+ 0.8888888888888888,
674
+ "#fdca26"
675
+ ],
676
+ [
677
+ 1,
678
+ "#f0f921"
679
+ ]
680
+ ],
681
+ "type": "heatmap"
682
+ }
683
+ ],
684
+ "heatmapgl": [
685
+ {
686
+ "colorbar": {
687
+ "outlinewidth": 0,
688
+ "ticks": ""
689
+ },
690
+ "colorscale": [
691
+ [
692
+ 0,
693
+ "#0d0887"
694
+ ],
695
+ [
696
+ 0.1111111111111111,
697
+ "#46039f"
698
+ ],
699
+ [
700
+ 0.2222222222222222,
701
+ "#7201a8"
702
+ ],
703
+ [
704
+ 0.3333333333333333,
705
+ "#9c179e"
706
+ ],
707
+ [
708
+ 0.4444444444444444,
709
+ "#bd3786"
710
+ ],
711
+ [
712
+ 0.5555555555555556,
713
+ "#d8576b"
714
+ ],
715
+ [
716
+ 0.6666666666666666,
717
+ "#ed7953"
718
+ ],
719
+ [
720
+ 0.7777777777777778,
721
+ "#fb9f3a"
722
+ ],
723
+ [
724
+ 0.8888888888888888,
725
+ "#fdca26"
726
+ ],
727
+ [
728
+ 1,
729
+ "#f0f921"
730
+ ]
731
+ ],
732
+ "type": "heatmapgl"
733
+ }
734
+ ],
735
+ "histogram": [
736
+ {
737
+ "marker": {
738
+ "pattern": {
739
+ "fillmode": "overlay",
740
+ "size": 10,
741
+ "solidity": 0.2
742
+ }
743
+ },
744
+ "type": "histogram"
745
+ }
746
+ ],
747
+ "histogram2d": [
748
+ {
749
+ "colorbar": {
750
+ "outlinewidth": 0,
751
+ "ticks": ""
752
+ },
753
+ "colorscale": [
754
+ [
755
+ 0,
756
+ "#0d0887"
757
+ ],
758
+ [
759
+ 0.1111111111111111,
760
+ "#46039f"
761
+ ],
762
+ [
763
+ 0.2222222222222222,
764
+ "#7201a8"
765
+ ],
766
+ [
767
+ 0.3333333333333333,
768
+ "#9c179e"
769
+ ],
770
+ [
771
+ 0.4444444444444444,
772
+ "#bd3786"
773
+ ],
774
+ [
775
+ 0.5555555555555556,
776
+ "#d8576b"
777
+ ],
778
+ [
779
+ 0.6666666666666666,
780
+ "#ed7953"
781
+ ],
782
+ [
783
+ 0.7777777777777778,
784
+ "#fb9f3a"
785
+ ],
786
+ [
787
+ 0.8888888888888888,
788
+ "#fdca26"
789
+ ],
790
+ [
791
+ 1,
792
+ "#f0f921"
793
+ ]
794
+ ],
795
+ "type": "histogram2d"
796
+ }
797
+ ],
798
+ "histogram2dcontour": [
799
+ {
800
+ "colorbar": {
801
+ "outlinewidth": 0,
802
+ "ticks": ""
803
+ },
804
+ "colorscale": [
805
+ [
806
+ 0,
807
+ "#0d0887"
808
+ ],
809
+ [
810
+ 0.1111111111111111,
811
+ "#46039f"
812
+ ],
813
+ [
814
+ 0.2222222222222222,
815
+ "#7201a8"
816
+ ],
817
+ [
818
+ 0.3333333333333333,
819
+ "#9c179e"
820
+ ],
821
+ [
822
+ 0.4444444444444444,
823
+ "#bd3786"
824
+ ],
825
+ [
826
+ 0.5555555555555556,
827
+ "#d8576b"
828
+ ],
829
+ [
830
+ 0.6666666666666666,
831
+ "#ed7953"
832
+ ],
833
+ [
834
+ 0.7777777777777778,
835
+ "#fb9f3a"
836
+ ],
837
+ [
838
+ 0.8888888888888888,
839
+ "#fdca26"
840
+ ],
841
+ [
842
+ 1,
843
+ "#f0f921"
844
+ ]
845
+ ],
846
+ "type": "histogram2dcontour"
847
+ }
848
+ ],
849
+ "mesh3d": [
850
+ {
851
+ "colorbar": {
852
+ "outlinewidth": 0,
853
+ "ticks": ""
854
+ },
855
+ "type": "mesh3d"
856
+ }
857
+ ],
858
+ "parcoords": [
859
+ {
860
+ "line": {
861
+ "colorbar": {
862
+ "outlinewidth": 0,
863
+ "ticks": ""
864
+ }
865
+ },
866
+ "type": "parcoords"
867
+ }
868
+ ],
869
+ "pie": [
870
+ {
871
+ "automargin": true,
872
+ "type": "pie"
873
+ }
874
+ ],
875
+ "scatter": [
876
+ {
877
+ "fillpattern": {
878
+ "fillmode": "overlay",
879
+ "size": 10,
880
+ "solidity": 0.2
881
+ },
882
+ "type": "scatter"
883
+ }
884
+ ],
885
+ "scatter3d": [
886
+ {
887
+ "line": {
888
+ "colorbar": {
889
+ "outlinewidth": 0,
890
+ "ticks": ""
891
+ }
892
+ },
893
+ "marker": {
894
+ "colorbar": {
895
+ "outlinewidth": 0,
896
+ "ticks": ""
897
+ }
898
+ },
899
+ "type": "scatter3d"
900
+ }
901
+ ],
902
+ "scattercarpet": [
903
+ {
904
+ "marker": {
905
+ "colorbar": {
906
+ "outlinewidth": 0,
907
+ "ticks": ""
908
+ }
909
+ },
910
+ "type": "scattercarpet"
911
+ }
912
+ ],
913
+ "scattergeo": [
914
+ {
915
+ "marker": {
916
+ "colorbar": {
917
+ "outlinewidth": 0,
918
+ "ticks": ""
919
+ }
920
+ },
921
+ "type": "scattergeo"
922
+ }
923
+ ],
924
+ "scattergl": [
925
+ {
926
+ "marker": {
927
+ "colorbar": {
928
+ "outlinewidth": 0,
929
+ "ticks": ""
930
+ }
931
+ },
932
+ "type": "scattergl"
933
+ }
934
+ ],
935
+ "scattermapbox": [
936
+ {
937
+ "marker": {
938
+ "colorbar": {
939
+ "outlinewidth": 0,
940
+ "ticks": ""
941
+ }
942
+ },
943
+ "type": "scattermapbox"
944
+ }
945
+ ],
946
+ "scatterpolar": [
947
+ {
948
+ "marker": {
949
+ "colorbar": {
950
+ "outlinewidth": 0,
951
+ "ticks": ""
952
+ }
953
+ },
954
+ "type": "scatterpolar"
955
+ }
956
+ ],
957
+ "scatterpolargl": [
958
+ {
959
+ "marker": {
960
+ "colorbar": {
961
+ "outlinewidth": 0,
962
+ "ticks": ""
963
+ }
964
+ },
965
+ "type": "scatterpolargl"
966
+ }
967
+ ],
968
+ "scatterternary": [
969
+ {
970
+ "marker": {
971
+ "colorbar": {
972
+ "outlinewidth": 0,
973
+ "ticks": ""
974
+ }
975
+ },
976
+ "type": "scatterternary"
977
+ }
978
+ ],
979
+ "surface": [
980
+ {
981
+ "colorbar": {
982
+ "outlinewidth": 0,
983
+ "ticks": ""
984
+ },
985
+ "colorscale": [
986
+ [
987
+ 0,
988
+ "#0d0887"
989
+ ],
990
+ [
991
+ 0.1111111111111111,
992
+ "#46039f"
993
+ ],
994
+ [
995
+ 0.2222222222222222,
996
+ "#7201a8"
997
+ ],
998
+ [
999
+ 0.3333333333333333,
1000
+ "#9c179e"
1001
+ ],
1002
+ [
1003
+ 0.4444444444444444,
1004
+ "#bd3786"
1005
+ ],
1006
+ [
1007
+ 0.5555555555555556,
1008
+ "#d8576b"
1009
+ ],
1010
+ [
1011
+ 0.6666666666666666,
1012
+ "#ed7953"
1013
+ ],
1014
+ [
1015
+ 0.7777777777777778,
1016
+ "#fb9f3a"
1017
+ ],
1018
+ [
1019
+ 0.8888888888888888,
1020
+ "#fdca26"
1021
+ ],
1022
+ [
1023
+ 1,
1024
+ "#f0f921"
1025
+ ]
1026
+ ],
1027
+ "type": "surface"
1028
+ }
1029
+ ],
1030
+ "table": [
1031
+ {
1032
+ "cells": {
1033
+ "fill": {
1034
+ "color": "#EBF0F8"
1035
+ },
1036
+ "line": {
1037
+ "color": "white"
1038
+ }
1039
+ },
1040
+ "header": {
1041
+ "fill": {
1042
+ "color": "#C8D4E3"
1043
+ },
1044
+ "line": {
1045
+ "color": "white"
1046
+ }
1047
+ },
1048
+ "type": "table"
1049
+ }
1050
+ ]
1051
+ },
1052
+ "layout": {
1053
+ "annotationdefaults": {
1054
+ "arrowcolor": "#2a3f5f",
1055
+ "arrowhead": 0,
1056
+ "arrowwidth": 1
1057
+ },
1058
+ "autotypenumbers": "strict",
1059
+ "coloraxis": {
1060
+ "colorbar": {
1061
+ "outlinewidth": 0,
1062
+ "ticks": ""
1063
+ }
1064
+ },
1065
+ "colorscale": {
1066
+ "diverging": [
1067
+ [
1068
+ 0,
1069
+ "#8e0152"
1070
+ ],
1071
+ [
1072
+ 0.1,
1073
+ "#c51b7d"
1074
+ ],
1075
+ [
1076
+ 0.2,
1077
+ "#de77ae"
1078
+ ],
1079
+ [
1080
+ 0.3,
1081
+ "#f1b6da"
1082
+ ],
1083
+ [
1084
+ 0.4,
1085
+ "#fde0ef"
1086
+ ],
1087
+ [
1088
+ 0.5,
1089
+ "#f7f7f7"
1090
+ ],
1091
+ [
1092
+ 0.6,
1093
+ "#e6f5d0"
1094
+ ],
1095
+ [
1096
+ 0.7,
1097
+ "#b8e186"
1098
+ ],
1099
+ [
1100
+ 0.8,
1101
+ "#7fbc41"
1102
+ ],
1103
+ [
1104
+ 0.9,
1105
+ "#4d9221"
1106
+ ],
1107
+ [
1108
+ 1,
1109
+ "#276419"
1110
+ ]
1111
+ ],
1112
+ "sequential": [
1113
+ [
1114
+ 0,
1115
+ "#0d0887"
1116
+ ],
1117
+ [
1118
+ 0.1111111111111111,
1119
+ "#46039f"
1120
+ ],
1121
+ [
1122
+ 0.2222222222222222,
1123
+ "#7201a8"
1124
+ ],
1125
+ [
1126
+ 0.3333333333333333,
1127
+ "#9c179e"
1128
+ ],
1129
+ [
1130
+ 0.4444444444444444,
1131
+ "#bd3786"
1132
+ ],
1133
+ [
1134
+ 0.5555555555555556,
1135
+ "#d8576b"
1136
+ ],
1137
+ [
1138
+ 0.6666666666666666,
1139
+ "#ed7953"
1140
+ ],
1141
+ [
1142
+ 0.7777777777777778,
1143
+ "#fb9f3a"
1144
+ ],
1145
+ [
1146
+ 0.8888888888888888,
1147
+ "#fdca26"
1148
+ ],
1149
+ [
1150
+ 1,
1151
+ "#f0f921"
1152
+ ]
1153
+ ],
1154
+ "sequentialminus": [
1155
+ [
1156
+ 0,
1157
+ "#0d0887"
1158
+ ],
1159
+ [
1160
+ 0.1111111111111111,
1161
+ "#46039f"
1162
+ ],
1163
+ [
1164
+ 0.2222222222222222,
1165
+ "#7201a8"
1166
+ ],
1167
+ [
1168
+ 0.3333333333333333,
1169
+ "#9c179e"
1170
+ ],
1171
+ [
1172
+ 0.4444444444444444,
1173
+ "#bd3786"
1174
+ ],
1175
+ [
1176
+ 0.5555555555555556,
1177
+ "#d8576b"
1178
+ ],
1179
+ [
1180
+ 0.6666666666666666,
1181
+ "#ed7953"
1182
+ ],
1183
+ [
1184
+ 0.7777777777777778,
1185
+ "#fb9f3a"
1186
+ ],
1187
+ [
1188
+ 0.8888888888888888,
1189
+ "#fdca26"
1190
+ ],
1191
+ [
1192
+ 1,
1193
+ "#f0f921"
1194
+ ]
1195
+ ]
1196
+ },
1197
+ "colorway": [
1198
+ "#636efa",
1199
+ "#EF553B",
1200
+ "#00cc96",
1201
+ "#ab63fa",
1202
+ "#FFA15A",
1203
+ "#19d3f3",
1204
+ "#FF6692",
1205
+ "#B6E880",
1206
+ "#FF97FF",
1207
+ "#FECB52"
1208
+ ],
1209
+ "font": {
1210
+ "color": "#2a3f5f"
1211
+ },
1212
+ "geo": {
1213
+ "bgcolor": "white",
1214
+ "lakecolor": "white",
1215
+ "landcolor": "#E5ECF6",
1216
+ "showlakes": true,
1217
+ "showland": true,
1218
+ "subunitcolor": "white"
1219
+ },
1220
+ "hoverlabel": {
1221
+ "align": "left"
1222
+ },
1223
+ "hovermode": "closest",
1224
+ "mapbox": {
1225
+ "style": "light"
1226
+ },
1227
+ "paper_bgcolor": "white",
1228
+ "plot_bgcolor": "#E5ECF6",
1229
+ "polar": {
1230
+ "angularaxis": {
1231
+ "gridcolor": "white",
1232
+ "linecolor": "white",
1233
+ "ticks": ""
1234
+ },
1235
+ "bgcolor": "#E5ECF6",
1236
+ "radialaxis": {
1237
+ "gridcolor": "white",
1238
+ "linecolor": "white",
1239
+ "ticks": ""
1240
+ }
1241
+ },
1242
+ "scene": {
1243
+ "xaxis": {
1244
+ "backgroundcolor": "#E5ECF6",
1245
+ "gridcolor": "white",
1246
+ "gridwidth": 2,
1247
+ "linecolor": "white",
1248
+ "showbackground": true,
1249
+ "ticks": "",
1250
+ "zerolinecolor": "white"
1251
+ },
1252
+ "yaxis": {
1253
+ "backgroundcolor": "#E5ECF6",
1254
+ "gridcolor": "white",
1255
+ "gridwidth": 2,
1256
+ "linecolor": "white",
1257
+ "showbackground": true,
1258
+ "ticks": "",
1259
+ "zerolinecolor": "white"
1260
+ },
1261
+ "zaxis": {
1262
+ "backgroundcolor": "#E5ECF6",
1263
+ "gridcolor": "white",
1264
+ "gridwidth": 2,
1265
+ "linecolor": "white",
1266
+ "showbackground": true,
1267
+ "ticks": "",
1268
+ "zerolinecolor": "white"
1269
+ }
1270
+ },
1271
+ "shapedefaults": {
1272
+ "line": {
1273
+ "color": "#2a3f5f"
1274
+ }
1275
+ },
1276
+ "ternary": {
1277
+ "aaxis": {
1278
+ "gridcolor": "white",
1279
+ "linecolor": "white",
1280
+ "ticks": ""
1281
+ },
1282
+ "baxis": {
1283
+ "gridcolor": "white",
1284
+ "linecolor": "white",
1285
+ "ticks": ""
1286
+ },
1287
+ "bgcolor": "#E5ECF6",
1288
+ "caxis": {
1289
+ "gridcolor": "white",
1290
+ "linecolor": "white",
1291
+ "ticks": ""
1292
+ }
1293
+ },
1294
+ "title": {
1295
+ "x": 0.05
1296
+ },
1297
+ "xaxis": {
1298
+ "automargin": true,
1299
+ "gridcolor": "white",
1300
+ "linecolor": "white",
1301
+ "ticks": "",
1302
+ "title": {
1303
+ "standoff": 15
1304
+ },
1305
+ "zerolinecolor": "white",
1306
+ "zerolinewidth": 2
1307
+ },
1308
+ "yaxis": {
1309
+ "automargin": true,
1310
+ "gridcolor": "white",
1311
+ "linecolor": "white",
1312
+ "ticks": "",
1313
+ "title": {
1314
+ "standoff": 15
1315
+ },
1316
+ "zerolinecolor": "white",
1317
+ "zerolinewidth": 2
1318
+ }
1319
+ }
1320
+ },
1321
+ "title": {
1322
+ "text": "Sampling from 100 identical buckets with 200B tokens each"
1323
+ },
1324
+ "xaxis": {
1325
+ "title": {
1326
+ "text": "Sample size"
1327
+ }
1328
+ },
1329
+ "yaxis": {
1330
+ "range": [
1331
+ 0,
1332
+ 1.000001
1333
+ ],
1334
+ "title": {
1335
+ "text": "Dataset fraction"
1336
+ }
1337
+ }
1338
+ }
1339
+ }
1340
+ },
1341
+ "metadata": {},
1342
+ "output_type": "display_data"
1343
+ }
1344
+ ],
1345
+ "source": [
1346
+ "import pandas as pd\n",
1347
+ "import matplotlib.pyplot as plt\n",
1348
+ "\n",
1349
+ "import plotly.graph_objects as go\n",
1350
+ "\n",
1351
+ "df = pd.read_csv(\"./data/duplicates-simulation.csv\", index_col=0)\n",
1352
+ "\n",
1353
+ "def summarize_ranges(df):\n",
1354
+ " df_summarized = pd.DataFrame(index=['1', '2', '3', '4-8', '8-16', '16-32'], columns=df.columns)\n",
1355
+ " df_summarized.loc['1'] = df.iloc[0]\n",
1356
+ " df_summarized.loc['2'] = df.iloc[1]\n",
1357
+ " df_summarized.loc['3'] = df.iloc[2]\n",
1358
+ " df_summarized.loc['4-8'] = df.iloc[3:9].sum()\n",
1359
+ " df_summarized.loc['8-16'] = df.iloc[9:17].sum()\n",
1360
+ " df_summarized.loc['16-32'] = df.iloc[17:].sum()\n",
1361
+ " return df_summarized\n",
1362
+ "\n",
1363
+ "summarized_df = summarize_ranges(df).T\n",
1364
+ "\n",
1365
+ "# Create a stacked bar chart using Plotly\n",
1366
+ "fig = go.Figure()\n",
1367
+ "for col in summarized_df.columns:\n",
1368
+ " fig.add_trace(go.Bar(\n",
1369
+ " x=summarized_df.index,\n",
1370
+ " y=summarized_df[col],\n",
1371
+ " name=col\n",
1372
+ " ))\n",
1373
+ "\n",
1374
+ "fig.update_layout(\n",
1375
+ " barmode='stack',\n",
1376
+ " title_text=\"Sampling from 100 identical buckets with 200B tokens each\",\n",
1377
+ " xaxis_title=\"Sample size\",\n",
1378
+ " yaxis_title=\"Dataset fraction\",\n",
1379
+ " yaxis=dict(range=[0, 1.000001]),\n",
1380
+ " legend_title=\"# duplicates\",\n",
1381
+ ")\n",
1382
+ "\n",
1383
+ "fig.show()\n"
1384
+ ]
1385
+ },
1386
+ {
1387
+ "cell_type": "code",
1388
+ "execution_count": 3,
1389
+ "metadata": {},
1390
+ "outputs": [
1391
+ {
1392
+ "data": {
1393
+ "text/plain": [
1394
+ "Index(['1B', '10B', '100B', '350B', '1T'], dtype='object')"
1395
+ ]
1396
+ },
1397
+ "execution_count": 3,
1398
+ "metadata": {},
1399
+ "output_type": "execute_result"
1400
+ }
1401
+ ],
1402
+ "source": [
1403
+ "summarized_df.index"
1404
+ ]
1405
+ }
1406
+ ],
1407
+ "metadata": {
1408
+ "kernelspec": {
1409
+ "display_name": "datatrove",
1410
+ "language": "python",
1411
+ "name": "python3"
1412
+ },
1413
+ "language_info": {
1414
+ "name": "python",
1415
+ "version": "3.12.2"
1416
+ }
1417
+ },
1418
+ "nbformat": 4,
1419
+ "nbformat_minor": 2
1420
+ }
notebooks/plot_histograms_cross.ipynb ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from collections import defaultdict\n",
17
+ "\n",
18
+ "import pandas as pd\n",
19
+ "\n",
20
+ "\n",
21
+ "def get_setting(name):\n",
22
+ " if \"terminal-punct\" in name:\n",
23
+ " return {\"x\": \"Fraction of lines ended with punctuation\", \"ylim\": (0, 0.1)}\n",
24
+ " \n",
25
+ " if \"line-dedup\" in name:\n",
26
+ " return {\"x\": \"Fraction of chars in duplicated lines\", \"xlim\": (0, 0.1), \"ylim\": (0,0.02)}\n",
27
+ " \n",
28
+ " if \"short-line\" in name:\n",
29
+ " return {\"x\": \"Fraction of lines shorter than 30 chars\", \"xlim\": (0.4, 1.0), \"ylim\": (0,0.05)}\n",
30
+ " \n",
31
+ " if \"avg_words_per_line\" in name:\n",
32
+ " return {\"x\": \"Avg. words per line\", \"x-log\": True, \"x-log\": True, \"round\": 0}\n",
33
+ " if \"avg_line_length\" in name:\n",
34
+ " return {\"x\": \"Avg. words per line\", \"x-log\": True, \"round\": 0}\n",
35
+ " \n",
36
+ " if \"global-length.json\" == name:\n",
37
+ " return {\"x\": \"Num. UTF-8 chars\", \"x-log\": True}\n",
38
+ " \n",
39
+ " if \"global-digit_ratio.json\" == name:\n",
40
+ " return {\"x\": \"Digit ratio\", \"xlim\": (0, 0.25)}\n",
41
+ " \n",
42
+ " if \"global-avg_word_length.json\" == name:\n",
43
+ " return {\"x\": \"Avg. word length\", \"xlim\": (2.5, 6.5)}\n",
44
+ "\n",
45
+ " \n",
46
+ " raise ValueError(f\"Unknown dataset name: {name}\")\n",
47
+ "\n",
48
+ "\n",
49
+ "def plot_scatter(data):\n",
50
+ " \"\"\"\n",
51
+ "    Plot smoothed line plots for each dataset in the `data` mapping on a single grid.\n",
51
+ "    `data` maps each dataset name to a dictionary of line names, and each line name maps to\n",
52
+ "    another dictionary where keys are data points and values are their fractions (summing to 1).\n",
54
+ " \"\"\"\n",
55
+ " import matplotlib.pyplot as plt\n",
56
+ " import numpy as np\n",
57
+ "\n",
58
+ " # Determine the number of plots and create a subplot grid\n",
59
+ " num_datasets = len(data)\n",
60
+ " cols = 2 # Define number of columns in the grid\n",
61
+ " rows = (num_datasets) // cols # Calculate the required number of rows\n",
62
+ " fig, axs = plt.subplots(rows, cols, figsize=(8 * cols, 3 * rows), dpi=350)\n",
63
+ " if rows * cols > 1:\n",
64
+ " axs = axs.flatten() # Flatten the array of axes if more than one subplot\n",
65
+ " else:\n",
66
+ " axs = [axs] # Encapsulate the single AxesSubplot object into a list for uniform handling\n",
67
+ "\n",
68
+ " plot_index = 0\n",
69
+ " legend_handles = [] # List to store handles for the legend\n",
70
+ " legend_labels = [] # List to store labels for the legend\n",
71
+ " for name, dataset in data.items():\n",
72
+ " setting = get_setting(name)\n",
73
+ " ax = axs[plot_index]\n",
74
+ " if \"name\" in setting:\n",
75
+ " ax.set_title(setting[\"name\"])\n",
76
+ " if \"x\" in setting:\n",
77
+ " ax.set_xlabel(setting[\"x\"])\n",
78
+ " if \"xlim\" in setting:\n",
79
+ " ax.set_xlim(setting[\"xlim\"])\n",
80
+ " if \"ylim\" in setting:\n",
81
+ " ax.set_ylim(setting[\"ylim\"])\n",
82
+ " if \"x-log\" in setting:\n",
83
+ " ax.set_xscale('log')\n",
84
+ "\n",
85
+ "        # Use 3 decimal places for the y-axis labels\n",
86
+ " ax.yaxis.set_major_formatter('{x:.3f}')\n",
87
+ "\n",
88
+ "\n",
89
+ " plot_index += 1\n",
90
+ " # Each dataset may contain multiple lines\n",
91
+ " for i, (line_name, line_data) in enumerate(dataset.items()):\n",
92
+ " if \"round\" in setting:\n",
93
+ " tmp_line_data = defaultdict(list)\n",
94
+ " for p, p_v in line_data.items():\n",
95
+ " rounded_key = str(round(float(p), setting[\"round\"]))\n",
96
+ " tmp_line_data[rounded_key].append(p_v)\n",
97
+ "\n",
98
+ "                # Sum the values that share the same rounded key\n",
99
+ " tmp_line_data = {k: sum(v) for k, v in tmp_line_data.items()}\n",
100
+ " line_data = tmp_line_data\n",
101
+ " \n",
102
+ " # Check that if you sum the values you get 1\n",
103
+ " assert sum(line_data.values()) == 1\n",
104
+ "\n",
105
+ " # Add smoothing for 4-5 points\n",
106
+ " # Implementing smoothing using a rolling window\n",
107
+ " line_name = rename_dataset(line_name)\n",
108
+ " # Sorting the line data by keys\n",
109
+ " sorted_line_data = dict(sorted(line_data.items(), key=lambda item: float(item[0])))\n",
110
+ "\n",
111
+ " window_size = setting.get(\"window_size\", 5) # Define the window size for smoothing\n",
112
+ " x = np.array(list(sorted_line_data.keys()), dtype=float)\n",
113
+ " y = np.array(list(sorted_line_data.values()), dtype=float)\n",
114
+ " if len(y) >= window_size: # Ensure there are enough points to apply smoothing\n",
115
+ " # Convert y to a pandas Series to use rolling function\n",
116
+ " y_series = pd.Series(y)\n",
117
+ " # Apply rolling window and mean to smooth the data\n",
118
+ " y_smoothed = y_series.rolling(window=window_size).mean()\n",
119
+ " # Drop NaN values that result from the rolling mean calculation\n",
120
+ " y_smoothed = y_smoothed.dropna()\n",
121
+ " # Update x to correspond to the length of the smoothed y\n",
122
+ " x = x[len(x) - len(y_smoothed):]\n",
123
+ " y = y_smoothed.to_numpy() # Convert back to numpy array for plotting\n",
124
+ "\n",
125
+ "\n",
126
+ "\n",
127
+ " # Use the line name as the label to unify same line names across different plots\n",
128
+ "\n",
129
+ " line, = ax.plot(x, y, label=line_name) # Use default colors\n",
130
+ " if line_name not in legend_labels:\n",
131
+ " legend_handles.append(line)\n",
132
+ " legend_labels.append(line_name)\n",
133
+ "\n",
134
+ "    # Place a single shared legend at the bottom of the figure\n",
135
+ " fig.legend(handles=legend_handles, labels=legend_labels, loc='lower center', ncol=1)\n",
136
+ " for ax in axs:\n",
137
+ " ax.set_ylabel('Document Frequency')\n",
138
+ "\n",
139
+ " fig.suptitle(\"Histograms of selected statistics\")\n",
140
+ " plt.tight_layout(rect=[0, 0.15, 1, 1]) # Adjust the layout to make room for the legend\n",
141
+ "    fig.set_size_inches(13, 6)  # Set the figure size to 13 inches by 6 inches\n",
142
+ " plt.show()\n",
143
+ "\n",
144
+ "plot_scatter(data)\n"
145
+ ]
146
+ }
147
+ ],
148
+ "metadata": {
149
+ "language_info": {
150
+ "name": "python"
151
+ }
152
+ },
153
+ "nbformat": 4,
154
+ "nbformat_minor": 2
155
+ }
notebooks/plot_removed_data_dedup.ipynb ADDED
@@ -0,0 +1,1578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 23,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-04-30T13:28:07.130909Z",
10
+ "start_time": "2024-04-30T13:28:06.470042Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>deduped_removed_cross</td>\n",
63
+ " <td>5</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330893</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.208</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2195</td>\n",
80
+ " <td>0.2510</td>\n",
81
+ " <td>0.230294</td>\n",
82
+ " <td>0.250147</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>deduped_removed_cross</td>\n",
87
+ " <td>5</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.354090</td>\n",
90
+ " <td>0.253</td>\n",
91
+ " <td>0.257</td>\n",
92
+ " <td>0.290</td>\n",
93
+ " <td>0.278</td>\n",
94
+ " <td>0.124</td>\n",
95
+ " <td>0.264</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.368</td>\n",
98
+ " <td>0.389</td>\n",
99
+ " <td>0.509</td>\n",
100
+ " <td>0.491</td>\n",
101
+ " <td>0.582</td>\n",
102
+ " <td>0.516</td>\n",
103
+ " <td>0.2825</td>\n",
104
+ " <td>0.2955</td>\n",
105
+ " <td>0.239520</td>\n",
106
+ " <td>0.253223</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>deduped_removed_cross</td>\n",
111
+ " <td>5</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.373601</td>\n",
114
+ " <td>0.274</td>\n",
115
+ " <td>0.290</td>\n",
116
+ " <td>0.313</td>\n",
117
+ " <td>0.312</td>\n",
118
+ " <td>0.116</td>\n",
119
+ " <td>0.258</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.367</td>\n",
122
+ " <td>0.397</td>\n",
123
+ " <td>0.516</td>\n",
124
+ " <td>0.505</td>\n",
125
+ " <td>0.686</td>\n",
126
+ " <td>0.582</td>\n",
127
+ " <td>0.3090</td>\n",
128
+ " <td>0.3200</td>\n",
129
+ " <td>0.247320</td>\n",
130
+ " <td>0.262812</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>deduped_removed_cross</td>\n",
135
+ " <td>5</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.383122</td>\n",
138
+ " <td>0.306</td>\n",
139
+ " <td>0.292</td>\n",
140
+ " <td>0.323</td>\n",
141
+ " <td>0.335</td>\n",
142
+ " <td>0.150</td>\n",
143
+ " <td>0.278</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.371</td>\n",
146
+ " <td>0.401</td>\n",
147
+ " <td>0.513</td>\n",
148
+ " <td>0.500</td>\n",
149
+ " <td>0.712</td>\n",
150
+ " <td>0.611</td>\n",
151
+ " <td>0.3075</td>\n",
152
+ " <td>0.3415</td>\n",
153
+ " <td>0.248568</td>\n",
154
+ " <td>0.263474</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>deduped_removed_cross</td>\n",
159
+ " <td>5</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.390222</td>\n",
162
+ " <td>0.300</td>\n",
163
+ " <td>0.292</td>\n",
164
+ " <td>0.324</td>\n",
165
+ " <td>0.351</td>\n",
166
+ " <td>0.144</td>\n",
167
+ " <td>0.278</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.386</td>\n",
170
+ " <td>0.395</td>\n",
171
+ " <td>0.511</td>\n",
172
+ " <td>0.511</td>\n",
173
+ " <td>0.750</td>\n",
174
+ " <td>0.658</td>\n",
175
+ " <td>0.3260</td>\n",
176
+ " <td>0.3445</td>\n",
177
+ " <td>0.259246</td>\n",
178
+ " <td>0.273276</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>5</th>\n",
182
+ " <td>deduped_removed_cross</td>\n",
183
+ " <td>5</td>\n",
184
+ " <td>5000</td>\n",
185
+ " <td>0.400239</td>\n",
186
+ " <td>0.322</td>\n",
187
+ " <td>0.308</td>\n",
188
+ " <td>0.325</td>\n",
189
+ " <td>0.364</td>\n",
190
+ " <td>0.172</td>\n",
191
+ " <td>0.298</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>0.382</td>\n",
194
+ " <td>0.398</td>\n",
195
+ " <td>0.518</td>\n",
196
+ " <td>0.522</td>\n",
197
+ " <td>0.751</td>\n",
198
+ " <td>0.661</td>\n",
199
+ " <td>0.3470</td>\n",
200
+ " <td>0.3545</td>\n",
201
+ " <td>0.258485</td>\n",
202
+ " <td>0.271414</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>6</th>\n",
206
+ " <td>deduped_removed_cross</td>\n",
207
+ " <td>5</td>\n",
208
+ " <td>6000</td>\n",
209
+ " <td>0.401484</td>\n",
210
+ " <td>0.315</td>\n",
211
+ " <td>0.314</td>\n",
212
+ " <td>0.341</td>\n",
213
+ " <td>0.372</td>\n",
214
+ " <td>0.162</td>\n",
215
+ " <td>0.314</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.377</td>\n",
218
+ " <td>0.390</td>\n",
219
+ " <td>0.498</td>\n",
220
+ " <td>0.492</td>\n",
221
+ " <td>0.776</td>\n",
222
+ " <td>0.669</td>\n",
223
+ " <td>0.3530</td>\n",
224
+ " <td>0.3565</td>\n",
225
+ " <td>0.261842</td>\n",
226
+ " <td>0.276371</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>7</th>\n",
230
+ " <td>deduped_removed_cross</td>\n",
231
+ " <td>5</td>\n",
232
+ " <td>7000</td>\n",
233
+ " <td>0.403533</td>\n",
234
+ " <td>0.324</td>\n",
235
+ " <td>0.315</td>\n",
236
+ " <td>0.350</td>\n",
237
+ " <td>0.386</td>\n",
238
+ " <td>0.188</td>\n",
239
+ " <td>0.298</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.376</td>\n",
242
+ " <td>0.384</td>\n",
243
+ " <td>0.518</td>\n",
244
+ " <td>0.521</td>\n",
245
+ " <td>0.769</td>\n",
246
+ " <td>0.672</td>\n",
247
+ " <td>0.3625</td>\n",
248
+ " <td>0.3585</td>\n",
249
+ " <td>0.265558</td>\n",
250
+ " <td>0.274768</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>8</th>\n",
254
+ " <td>deduped_removed_cross</td>\n",
255
+ " <td>5</td>\n",
256
+ " <td>8000</td>\n",
257
+ " <td>0.411774</td>\n",
258
+ " <td>0.344</td>\n",
259
+ " <td>0.313</td>\n",
260
+ " <td>0.352</td>\n",
261
+ " <td>0.409</td>\n",
262
+ " <td>0.170</td>\n",
263
+ " <td>0.310</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.374</td>\n",
266
+ " <td>0.391</td>\n",
267
+ " <td>0.530</td>\n",
268
+ " <td>0.521</td>\n",
269
+ " <td>0.781</td>\n",
270
+ " <td>0.677</td>\n",
271
+ " <td>0.3530</td>\n",
272
+ " <td>0.3615</td>\n",
273
+ " <td>0.267141</td>\n",
274
+ " <td>0.283691</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>9</th>\n",
278
+ " <td>deduped_removed_cross</td>\n",
279
+ " <td>5</td>\n",
280
+ " <td>9000</td>\n",
281
+ " <td>0.410993</td>\n",
282
+ " <td>0.335</td>\n",
283
+ " <td>0.322</td>\n",
284
+ " <td>0.361</td>\n",
285
+ " <td>0.404</td>\n",
286
+ " <td>0.182</td>\n",
287
+ " <td>0.294</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.374</td>\n",
290
+ " <td>0.391</td>\n",
291
+ " <td>0.526</td>\n",
292
+ " <td>0.514</td>\n",
293
+ " <td>0.769</td>\n",
294
+ " <td>0.672</td>\n",
295
+ " <td>0.3630</td>\n",
296
+ " <td>0.3715</td>\n",
297
+ " <td>0.266464</td>\n",
298
+ " <td>0.284446</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>10</th>\n",
302
+ " <td>deduped_removed_cross</td>\n",
303
+ " <td>5</td>\n",
304
+ " <td>10000</td>\n",
305
+ " <td>0.417883</td>\n",
306
+ " <td>0.330</td>\n",
307
+ " <td>0.320</td>\n",
308
+ " <td>0.370</td>\n",
309
+ " <td>0.417</td>\n",
310
+ " <td>0.192</td>\n",
311
+ " <td>0.324</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.389</td>\n",
314
+ " <td>0.389</td>\n",
315
+ " <td>0.518</td>\n",
316
+ " <td>0.524</td>\n",
317
+ " <td>0.785</td>\n",
318
+ " <td>0.682</td>\n",
319
+ " <td>0.3735</td>\n",
320
+ " <td>0.3745</td>\n",
321
+ " <td>0.268085</td>\n",
322
+ " <td>0.283562</td>\n",
323
+ " </tr>\n",
324
+ " <tr>\n",
325
+ " <th>11</th>\n",
326
+ " <td>deduped_removed_cross</td>\n",
327
+ " <td>5</td>\n",
328
+ " <td>11000</td>\n",
329
+ " <td>0.422325</td>\n",
330
+ " <td>0.332</td>\n",
331
+ " <td>0.328</td>\n",
332
+ " <td>0.366</td>\n",
333
+ " <td>0.426</td>\n",
334
+ " <td>0.188</td>\n",
335
+ " <td>0.320</td>\n",
336
+ " <td>...</td>\n",
337
+ " <td>0.398</td>\n",
338
+ " <td>0.397</td>\n",
339
+ " <td>0.535</td>\n",
340
+ " <td>0.529</td>\n",
341
+ " <td>0.801</td>\n",
342
+ " <td>0.695</td>\n",
343
+ " <td>0.3775</td>\n",
344
+ " <td>0.3800</td>\n",
345
+ " <td>0.267457</td>\n",
346
+ " <td>0.285596</td>\n",
347
+ " </tr>\n",
348
+ " <tr>\n",
349
+ " <th>12</th>\n",
350
+ " <td>deduped_removed_cross</td>\n",
351
+ " <td>5</td>\n",
352
+ " <td>12000</td>\n",
353
+ " <td>0.420167</td>\n",
354
+ " <td>0.348</td>\n",
355
+ " <td>0.324</td>\n",
356
+ " <td>0.364</td>\n",
357
+ " <td>0.434</td>\n",
358
+ " <td>0.194</td>\n",
359
+ " <td>0.306</td>\n",
360
+ " <td>...</td>\n",
361
+ " <td>0.377</td>\n",
362
+ " <td>0.392</td>\n",
363
+ " <td>0.541</td>\n",
364
+ " <td>0.527</td>\n",
365
+ " <td>0.790</td>\n",
366
+ " <td>0.690</td>\n",
367
+ " <td>0.3680</td>\n",
368
+ " <td>0.3755</td>\n",
369
+ " <td>0.267547</td>\n",
370
+ " <td>0.285836</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>13</th>\n",
374
+ " <td>deduped_removed_cross</td>\n",
375
+ " <td>5</td>\n",
376
+ " <td>13000</td>\n",
377
+ " <td>0.422913</td>\n",
378
+ " <td>0.346</td>\n",
379
+ " <td>0.330</td>\n",
380
+ " <td>0.372</td>\n",
381
+ " <td>0.438</td>\n",
382
+ " <td>0.190</td>\n",
383
+ " <td>0.320</td>\n",
384
+ " <td>...</td>\n",
385
+ " <td>0.392</td>\n",
386
+ " <td>0.396</td>\n",
387
+ " <td>0.540</td>\n",
388
+ " <td>0.522</td>\n",
389
+ " <td>0.802</td>\n",
390
+ " <td>0.707</td>\n",
391
+ " <td>0.3760</td>\n",
392
+ " <td>0.3845</td>\n",
393
+ " <td>0.271108</td>\n",
394
+ " <td>0.287802</td>\n",
395
+ " </tr>\n",
396
+ " <tr>\n",
397
+ " <th>14</th>\n",
398
+ " <td>deduped_removed_cross</td>\n",
399
+ " <td>5</td>\n",
400
+ " <td>13500</td>\n",
401
+ " <td>0.421868</td>\n",
402
+ " <td>0.345</td>\n",
403
+ " <td>0.322</td>\n",
404
+ " <td>0.370</td>\n",
405
+ " <td>0.431</td>\n",
406
+ " <td>0.202</td>\n",
407
+ " <td>0.330</td>\n",
408
+ " <td>...</td>\n",
409
+ " <td>0.387</td>\n",
410
+ " <td>0.392</td>\n",
411
+ " <td>0.540</td>\n",
412
+ " <td>0.516</td>\n",
413
+ " <td>0.797</td>\n",
414
+ " <td>0.700</td>\n",
415
+ " <td>0.3790</td>\n",
416
+ " <td>0.3870</td>\n",
417
+ " <td>0.269510</td>\n",
418
+ " <td>0.287944</td>\n",
419
+ " </tr>\n",
420
+ " <tr>\n",
421
+ " <th>15</th>\n",
422
+ " <td>deduped_removed_cross</td>\n",
423
+ " <td>6</td>\n",
424
+ " <td>0</td>\n",
425
+ " <td>0.330893</td>\n",
426
+ " <td>0.186</td>\n",
427
+ " <td>0.233</td>\n",
428
+ " <td>0.272</td>\n",
429
+ " <td>0.258</td>\n",
430
+ " <td>0.166</td>\n",
431
+ " <td>0.286</td>\n",
432
+ " <td>...</td>\n",
433
+ " <td>0.367</td>\n",
434
+ " <td>0.362</td>\n",
435
+ " <td>0.516</td>\n",
436
+ " <td>0.497</td>\n",
437
+ " <td>0.208</td>\n",
438
+ " <td>0.202</td>\n",
439
+ " <td>0.2195</td>\n",
440
+ " <td>0.2510</td>\n",
441
+ " <td>0.230294</td>\n",
442
+ " <td>0.250147</td>\n",
443
+ " </tr>\n",
444
+ " <tr>\n",
445
+ " <th>16</th>\n",
446
+ " <td>deduped_removed_cross</td>\n",
447
+ " <td>6</td>\n",
448
+ " <td>1000</td>\n",
449
+ " <td>0.360039</td>\n",
450
+ " <td>0.236</td>\n",
451
+ " <td>0.259</td>\n",
452
+ " <td>0.283</td>\n",
453
+ " <td>0.277</td>\n",
454
+ " <td>0.130</td>\n",
455
+ " <td>0.274</td>\n",
456
+ " <td>...</td>\n",
457
+ " <td>0.354</td>\n",
458
+ " <td>0.386</td>\n",
459
+ " <td>0.509</td>\n",
460
+ " <td>0.507</td>\n",
461
+ " <td>0.559</td>\n",
462
+ " <td>0.500</td>\n",
463
+ " <td>0.2590</td>\n",
464
+ " <td>0.2970</td>\n",
465
+ " <td>0.243455</td>\n",
466
+ " <td>0.254311</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>17</th>\n",
470
+ " <td>deduped_removed_cross</td>\n",
471
+ " <td>6</td>\n",
472
+ " <td>2000</td>\n",
473
+ " <td>0.371564</td>\n",
474
+ " <td>0.270</td>\n",
475
+ " <td>0.283</td>\n",
476
+ " <td>0.303</td>\n",
477
+ " <td>0.305</td>\n",
478
+ " <td>0.132</td>\n",
479
+ " <td>0.280</td>\n",
480
+ " <td>...</td>\n",
481
+ " <td>0.377</td>\n",
482
+ " <td>0.392</td>\n",
483
+ " <td>0.522</td>\n",
484
+ " <td>0.504</td>\n",
485
+ " <td>0.665</td>\n",
486
+ " <td>0.566</td>\n",
487
+ " <td>0.3040</td>\n",
488
+ " <td>0.3135</td>\n",
489
+ " <td>0.249051</td>\n",
490
+ " <td>0.255010</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>18</th>\n",
494
+ " <td>deduped_removed_cross</td>\n",
495
+ " <td>6</td>\n",
496
+ " <td>3000</td>\n",
497
+ " <td>0.383770</td>\n",
498
+ " <td>0.283</td>\n",
499
+ " <td>0.286</td>\n",
500
+ " <td>0.323</td>\n",
501
+ " <td>0.320</td>\n",
502
+ " <td>0.156</td>\n",
503
+ " <td>0.296</td>\n",
504
+ " <td>...</td>\n",
505
+ " <td>0.375</td>\n",
506
+ " <td>0.394</td>\n",
507
+ " <td>0.503</td>\n",
508
+ " <td>0.497</td>\n",
509
+ " <td>0.721</td>\n",
510
+ " <td>0.626</td>\n",
511
+ " <td>0.3140</td>\n",
512
+ " <td>0.3410</td>\n",
513
+ " <td>0.254015</td>\n",
514
+ " <td>0.266158</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>19</th>\n",
518
+ " <td>deduped_removed_cross</td>\n",
519
+ " <td>6</td>\n",
520
+ " <td>4000</td>\n",
521
+ " <td>0.391082</td>\n",
522
+ " <td>0.293</td>\n",
523
+ " <td>0.298</td>\n",
524
+ " <td>0.339</td>\n",
525
+ " <td>0.361</td>\n",
526
+ " <td>0.160</td>\n",
527
+ " <td>0.292</td>\n",
528
+ " <td>...</td>\n",
529
+ " <td>0.380</td>\n",
530
+ " <td>0.399</td>\n",
531
+ " <td>0.505</td>\n",
532
+ " <td>0.494</td>\n",
533
+ " <td>0.719</td>\n",
534
+ " <td>0.615</td>\n",
535
+ " <td>0.3375</td>\n",
536
+ " <td>0.3375</td>\n",
537
+ " <td>0.256696</td>\n",
538
+ " <td>0.268152</td>\n",
539
+ " </tr>\n",
540
+ " <tr>\n",
541
+ " <th>20</th>\n",
542
+ " <td>deduped_removed_cross</td>\n",
543
+ " <td>6</td>\n",
544
+ " <td>5000</td>\n",
545
+ " <td>0.399130</td>\n",
546
+ " <td>0.309</td>\n",
547
+ " <td>0.311</td>\n",
548
+ " <td>0.343</td>\n",
549
+ " <td>0.376</td>\n",
550
+ " <td>0.160</td>\n",
551
+ " <td>0.286</td>\n",
552
+ " <td>...</td>\n",
553
+ " <td>0.392</td>\n",
554
+ " <td>0.401</td>\n",
555
+ " <td>0.525</td>\n",
556
+ " <td>0.512</td>\n",
557
+ " <td>0.733</td>\n",
558
+ " <td>0.639</td>\n",
559
+ " <td>0.3390</td>\n",
560
+ " <td>0.3580</td>\n",
561
+ " <td>0.257450</td>\n",
562
+ " <td>0.271040</td>\n",
563
+ " </tr>\n",
564
+ " <tr>\n",
565
+ " <th>21</th>\n",
566
+ " <td>deduped_removed_cross</td>\n",
567
+ " <td>6</td>\n",
568
+ " <td>6000</td>\n",
569
+ " <td>0.402792</td>\n",
570
+ " <td>0.326</td>\n",
571
+ " <td>0.318</td>\n",
572
+ " <td>0.353</td>\n",
573
+ " <td>0.387</td>\n",
574
+ " <td>0.176</td>\n",
575
+ " <td>0.284</td>\n",
576
+ " <td>...</td>\n",
577
+ " <td>0.376</td>\n",
578
+ " <td>0.405</td>\n",
579
+ " <td>0.522</td>\n",
580
+ " <td>0.514</td>\n",
581
+ " <td>0.753</td>\n",
582
+ " <td>0.664</td>\n",
583
+ " <td>0.3450</td>\n",
584
+ " <td>0.3645</td>\n",
585
+ " <td>0.262549</td>\n",
586
+ " <td>0.273836</td>\n",
587
+ " </tr>\n",
588
+ " <tr>\n",
589
+ " <th>22</th>\n",
590
+ " <td>deduped_removed_cross</td>\n",
591
+ " <td>6</td>\n",
592
+ " <td>7000</td>\n",
593
+ " <td>0.408846</td>\n",
594
+ " <td>0.319</td>\n",
595
+ " <td>0.319</td>\n",
596
+ " <td>0.356</td>\n",
597
+ " <td>0.407</td>\n",
598
+ " <td>0.172</td>\n",
599
+ " <td>0.300</td>\n",
600
+ " <td>...</td>\n",
601
+ " <td>0.386</td>\n",
602
+ " <td>0.399</td>\n",
603
+ " <td>0.521</td>\n",
604
+ " <td>0.521</td>\n",
605
+ " <td>0.764</td>\n",
606
+ " <td>0.662</td>\n",
607
+ " <td>0.3585</td>\n",
608
+ " <td>0.3625</td>\n",
609
+ " <td>0.262740</td>\n",
610
+ " <td>0.276266</td>\n",
611
+ " </tr>\n",
612
+ " <tr>\n",
613
+ " <th>23</th>\n",
614
+ " <td>deduped_removed_cross</td>\n",
615
+ " <td>6</td>\n",
616
+ " <td>8000</td>\n",
617
+ " <td>0.411429</td>\n",
618
+ " <td>0.314</td>\n",
619
+ " <td>0.323</td>\n",
620
+ " <td>0.361</td>\n",
621
+ " <td>0.412</td>\n",
622
+ " <td>0.168</td>\n",
623
+ " <td>0.286</td>\n",
624
+ " <td>...</td>\n",
625
+ " <td>0.395</td>\n",
626
+ " <td>0.404</td>\n",
627
+ " <td>0.533</td>\n",
628
+ " <td>0.511</td>\n",
629
+ " <td>0.754</td>\n",
630
+ " <td>0.646</td>\n",
631
+ " <td>0.3555</td>\n",
632
+ " <td>0.3690</td>\n",
633
+ " <td>0.263875</td>\n",
634
+ " <td>0.278433</td>\n",
635
+ " </tr>\n",
636
+ " <tr>\n",
637
+ " <th>24</th>\n",
638
+ " <td>deduped_removed_cross</td>\n",
639
+ " <td>6</td>\n",
640
+ " <td>9000</td>\n",
641
+ " <td>0.417279</td>\n",
642
+ " <td>0.337</td>\n",
643
+ " <td>0.329</td>\n",
644
+ " <td>0.367</td>\n",
645
+ " <td>0.421</td>\n",
646
+ " <td>0.176</td>\n",
647
+ " <td>0.294</td>\n",
648
+ " <td>...</td>\n",
649
+ " <td>0.407</td>\n",
650
+ " <td>0.403</td>\n",
651
+ " <td>0.532</td>\n",
652
+ " <td>0.526</td>\n",
653
+ " <td>0.775</td>\n",
654
+ " <td>0.666</td>\n",
655
+ " <td>0.3605</td>\n",
656
+ " <td>0.3730</td>\n",
657
+ " <td>0.265119</td>\n",
658
+ " <td>0.283235</td>\n",
659
+ " </tr>\n",
660
+ " <tr>\n",
661
+ " <th>25</th>\n",
662
+ " <td>deduped_removed_cross</td>\n",
663
+ " <td>6</td>\n",
664
+ " <td>10000</td>\n",
665
+ " <td>0.421399</td>\n",
666
+ " <td>0.339</td>\n",
667
+ " <td>0.322</td>\n",
668
+ " <td>0.376</td>\n",
669
+ " <td>0.426</td>\n",
670
+ " <td>0.174</td>\n",
671
+ " <td>0.320</td>\n",
672
+ " <td>...</td>\n",
673
+ " <td>0.397</td>\n",
674
+ " <td>0.401</td>\n",
675
+ " <td>0.542</td>\n",
676
+ " <td>0.532</td>\n",
677
+ " <td>0.764</td>\n",
678
+ " <td>0.673</td>\n",
679
+ " <td>0.3675</td>\n",
680
+ " <td>0.3840</td>\n",
681
+ " <td>0.272474</td>\n",
682
+ " <td>0.286190</td>\n",
683
+ " </tr>\n",
684
+ " <tr>\n",
685
+ " <th>26</th>\n",
686
+ " <td>deduped_removed_cross</td>\n",
687
+ " <td>6</td>\n",
688
+ " <td>11000</td>\n",
689
+ " <td>0.421204</td>\n",
690
+ " <td>0.349</td>\n",
691
+ " <td>0.337</td>\n",
692
+ " <td>0.378</td>\n",
693
+ " <td>0.428</td>\n",
694
+ " <td>0.188</td>\n",
695
+ " <td>0.314</td>\n",
696
+ " <td>...</td>\n",
697
+ " <td>0.403</td>\n",
698
+ " <td>0.398</td>\n",
699
+ " <td>0.530</td>\n",
700
+ " <td>0.516</td>\n",
701
+ " <td>NaN</td>\n",
702
+ " <td>NaN</td>\n",
703
+ " <td>0.3690</td>\n",
704
+ " <td>0.3780</td>\n",
705
+ " <td>0.269131</td>\n",
706
+ " <td>0.288633</td>\n",
707
+ " </tr>\n",
708
+ " <tr>\n",
709
+ " <th>27</th>\n",
710
+ " <td>deduped_removed_cross</td>\n",
711
+ " <td>6</td>\n",
712
+ " <td>12000</td>\n",
713
+ " <td>0.421667</td>\n",
714
+ " <td>0.342</td>\n",
715
+ " <td>0.326</td>\n",
716
+ " <td>0.383</td>\n",
717
+ " <td>0.434</td>\n",
718
+ " <td>0.174</td>\n",
719
+ " <td>0.310</td>\n",
720
+ " <td>...</td>\n",
721
+ " <td>0.399</td>\n",
722
+ " <td>0.396</td>\n",
723
+ " <td>0.538</td>\n",
724
+ " <td>0.525</td>\n",
725
+ " <td>NaN</td>\n",
726
+ " <td>NaN</td>\n",
727
+ " <td>0.3660</td>\n",
728
+ " <td>0.3810</td>\n",
729
+ " <td>0.270691</td>\n",
730
+ " <td>0.287333</td>\n",
731
+ " </tr>\n",
732
+ " <tr>\n",
733
+ " <th>28</th>\n",
734
+ " <td>deduped_removed_cross</td>\n",
735
+ " <td>6</td>\n",
736
+ " <td>13000</td>\n",
737
+ " <td>0.424979</td>\n",
738
+ " <td>0.349</td>\n",
739
+ " <td>0.336</td>\n",
740
+ " <td>0.383</td>\n",
741
+ " <td>0.440</td>\n",
742
+ " <td>0.178</td>\n",
743
+ " <td>0.314</td>\n",
744
+ " <td>...</td>\n",
745
+ " <td>0.401</td>\n",
746
+ " <td>0.392</td>\n",
747
+ " <td>0.535</td>\n",
748
+ " <td>0.526</td>\n",
749
+ " <td>NaN</td>\n",
750
+ " <td>NaN</td>\n",
751
+ " <td>0.3785</td>\n",
752
+ " <td>0.3905</td>\n",
753
+ " <td>0.268910</td>\n",
754
+ " <td>0.289335</td>\n",
755
+ " </tr>\n",
756
+ " <tr>\n",
757
+ " <th>29</th>\n",
758
+ " <td>deduped_removed_cross</td>\n",
759
+ " <td>6</td>\n",
760
+ " <td>13500</td>\n",
761
+ " <td>0.425356</td>\n",
762
+ " <td>0.347</td>\n",
763
+ " <td>0.333</td>\n",
764
+ " <td>0.386</td>\n",
765
+ " <td>0.444</td>\n",
766
+ " <td>0.186</td>\n",
767
+ " <td>0.322</td>\n",
768
+ " <td>...</td>\n",
769
+ " <td>0.406</td>\n",
770
+ " <td>0.392</td>\n",
771
+ " <td>0.543</td>\n",
772
+ " <td>0.527</td>\n",
773
+ " <td>0.783</td>\n",
774
+ " <td>0.682</td>\n",
775
+ " <td>0.3745</td>\n",
776
+ " <td>0.3890</td>\n",
777
+ " <td>0.270869</td>\n",
778
+ " <td>0.289845</td>\n",
779
+ " </tr>\n",
780
+ " <tr>\n",
781
+ " <th>30</th>\n",
782
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
783
+ " <td>6</td>\n",
784
+ " <td>0</td>\n",
785
+ " <td>0.331018</td>\n",
786
+ " <td>0.186</td>\n",
787
+ " <td>0.233</td>\n",
788
+ " <td>0.272</td>\n",
789
+ " <td>0.258</td>\n",
790
+ " <td>0.166</td>\n",
791
+ " <td>0.286</td>\n",
792
+ " <td>...</td>\n",
793
+ " <td>0.367</td>\n",
794
+ " <td>0.362</td>\n",
795
+ " <td>0.515</td>\n",
796
+ " <td>0.497</td>\n",
797
+ " <td>NaN</td>\n",
798
+ " <td>NaN</td>\n",
799
+ " <td>0.2195</td>\n",
800
+ " <td>0.2520</td>\n",
801
+ " <td>0.230228</td>\n",
802
+ " <td>0.250147</td>\n",
803
+ " </tr>\n",
804
+ " <tr>\n",
805
+ " <th>31</th>\n",
806
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
807
+ " <td>6</td>\n",
808
+ " <td>1000</td>\n",
809
+ " <td>0.349494</td>\n",
810
+ " <td>0.217</td>\n",
811
+ " <td>0.248</td>\n",
812
+ " <td>0.288</td>\n",
813
+ " <td>0.286</td>\n",
814
+ " <td>0.104</td>\n",
815
+ " <td>0.244</td>\n",
816
+ " <td>...</td>\n",
817
+ " <td>0.366</td>\n",
818
+ " <td>0.380</td>\n",
819
+ " <td>0.499</td>\n",
820
+ " <td>0.492</td>\n",
821
+ " <td>0.546</td>\n",
822
+ " <td>0.484</td>\n",
823
+ " <td>0.2565</td>\n",
824
+ " <td>0.2780</td>\n",
825
+ " <td>0.239651</td>\n",
826
+ " <td>0.253956</td>\n",
827
+ " </tr>\n",
828
+ " <tr>\n",
829
+ " <th>32</th>\n",
830
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
831
+ " <td>6</td>\n",
832
+ " <td>2000</td>\n",
833
+ " <td>0.367893</td>\n",
834
+ " <td>0.245</td>\n",
835
+ " <td>0.280</td>\n",
836
+ " <td>0.298</td>\n",
837
+ " <td>0.288</td>\n",
838
+ " <td>0.128</td>\n",
839
+ " <td>0.280</td>\n",
840
+ " <td>...</td>\n",
841
+ " <td>0.366</td>\n",
842
+ " <td>0.383</td>\n",
843
+ " <td>0.519</td>\n",
844
+ " <td>0.499</td>\n",
845
+ " <td>NaN</td>\n",
846
+ " <td>NaN</td>\n",
847
+ " <td>0.2845</td>\n",
848
+ " <td>0.3115</td>\n",
849
+ " <td>0.239715</td>\n",
850
+ " <td>0.253644</td>\n",
851
+ " </tr>\n",
852
+ " <tr>\n",
853
+ " <th>33</th>\n",
854
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
855
+ " <td>6</td>\n",
856
+ " <td>3000</td>\n",
857
+ " <td>0.379114</td>\n",
858
+ " <td>0.269</td>\n",
859
+ " <td>0.291</td>\n",
860
+ " <td>0.304</td>\n",
861
+ " <td>0.328</td>\n",
862
+ " <td>0.138</td>\n",
863
+ " <td>0.266</td>\n",
864
+ " <td>...</td>\n",
865
+ " <td>0.362</td>\n",
866
+ " <td>0.394</td>\n",
867
+ " <td>0.519</td>\n",
868
+ " <td>0.504</td>\n",
869
+ " <td>NaN</td>\n",
870
+ " <td>NaN</td>\n",
871
+ " <td>0.3035</td>\n",
872
+ " <td>0.3335</td>\n",
873
+ " <td>0.250551</td>\n",
874
+ " <td>0.262409</td>\n",
875
+ " </tr>\n",
876
+ " <tr>\n",
877
+ " <th>34</th>\n",
878
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
879
+ " <td>6</td>\n",
880
+ " <td>4000</td>\n",
881
+ " <td>0.383025</td>\n",
882
+ " <td>0.277</td>\n",
883
+ " <td>0.289</td>\n",
884
+ " <td>0.311</td>\n",
885
+ " <td>0.338</td>\n",
886
+ " <td>0.132</td>\n",
887
+ " <td>0.280</td>\n",
888
+ " <td>...</td>\n",
889
+ " <td>0.361</td>\n",
890
+ " <td>0.393</td>\n",
891
+ " <td>0.502</td>\n",
892
+ " <td>0.496</td>\n",
893
+ " <td>NaN</td>\n",
894
+ " <td>NaN</td>\n",
895
+ " <td>0.3105</td>\n",
896
+ " <td>0.3375</td>\n",
897
+ " <td>0.249887</td>\n",
898
+ " <td>0.263702</td>\n",
899
+ " </tr>\n",
900
+ " <tr>\n",
901
+ " <th>35</th>\n",
902
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
903
+ " <td>6</td>\n",
904
+ " <td>5000</td>\n",
905
+ " <td>0.387223</td>\n",
906
+ " <td>0.290</td>\n",
907
+ " <td>0.306</td>\n",
908
+ " <td>0.327</td>\n",
909
+ " <td>0.356</td>\n",
910
+ " <td>0.138</td>\n",
911
+ " <td>0.276</td>\n",
912
+ " <td>...</td>\n",
913
+ " <td>0.365</td>\n",
914
+ " <td>0.389</td>\n",
915
+ " <td>0.515</td>\n",
916
+ " <td>0.511</td>\n",
917
+ " <td>NaN</td>\n",
918
+ " <td>NaN</td>\n",
919
+ " <td>0.3190</td>\n",
920
+ " <td>0.3380</td>\n",
921
+ " <td>0.252621</td>\n",
922
+ " <td>0.266785</td>\n",
923
+ " </tr>\n",
924
+ " <tr>\n",
925
+ " <th>36</th>\n",
926
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
927
+ " <td>6</td>\n",
928
+ " <td>6000</td>\n",
929
+ " <td>0.394011</td>\n",
930
+ " <td>0.303</td>\n",
931
+ " <td>0.305</td>\n",
932
+ " <td>0.332</td>\n",
933
+ " <td>0.356</td>\n",
934
+ " <td>0.142</td>\n",
935
+ " <td>0.288</td>\n",
936
+ " <td>...</td>\n",
937
+ " <td>0.375</td>\n",
938
+ " <td>0.397</td>\n",
939
+ " <td>0.540</td>\n",
940
+ " <td>0.521</td>\n",
941
+ " <td>NaN</td>\n",
942
+ " <td>NaN</td>\n",
943
+ " <td>0.3280</td>\n",
944
+ " <td>0.3515</td>\n",
945
+ " <td>0.252255</td>\n",
946
+ " <td>0.265589</td>\n",
947
+ " </tr>\n",
948
+ " <tr>\n",
949
+ " <th>37</th>\n",
950
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
951
+ " <td>6</td>\n",
952
+ " <td>7000</td>\n",
953
+ " <td>0.398090</td>\n",
954
+ " <td>0.316</td>\n",
955
+ " <td>0.305</td>\n",
956
+ " <td>0.337</td>\n",
957
+ " <td>0.359</td>\n",
958
+ " <td>0.142</td>\n",
959
+ " <td>0.302</td>\n",
960
+ " <td>...</td>\n",
961
+ " <td>0.372</td>\n",
962
+ " <td>0.401</td>\n",
963
+ " <td>0.531</td>\n",
964
+ " <td>0.510</td>\n",
965
+ " <td>NaN</td>\n",
966
+ " <td>NaN</td>\n",
967
+ " <td>0.3320</td>\n",
968
+ " <td>0.3550</td>\n",
969
+ " <td>0.250146</td>\n",
970
+ " <td>0.267719</td>\n",
971
+ " </tr>\n",
972
+ " <tr>\n",
973
+ " <th>38</th>\n",
974
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
975
+ " <td>6</td>\n",
976
+ " <td>8000</td>\n",
977
+ " <td>0.398513</td>\n",
978
+ " <td>0.326</td>\n",
979
+ " <td>0.315</td>\n",
980
+ " <td>0.339</td>\n",
981
+ " <td>0.372</td>\n",
982
+ " <td>0.150</td>\n",
983
+ " <td>0.288</td>\n",
984
+ " <td>...</td>\n",
985
+ " <td>0.372</td>\n",
986
+ " <td>0.396</td>\n",
987
+ " <td>0.532</td>\n",
988
+ " <td>0.508</td>\n",
989
+ " <td>NaN</td>\n",
990
+ " <td>NaN</td>\n",
991
+ " <td>0.3365</td>\n",
992
+ " <td>0.3630</td>\n",
993
+ " <td>0.258433</td>\n",
994
+ " <td>0.274100</td>\n",
995
+ " </tr>\n",
996
+ " <tr>\n",
997
+ " <th>39</th>\n",
998
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
999
+ " <td>6</td>\n",
1000
+ " <td>9000</td>\n",
1001
+ " <td>0.397494</td>\n",
1002
+ " <td>0.310</td>\n",
1003
+ " <td>0.314</td>\n",
1004
+ " <td>0.345</td>\n",
1005
+ " <td>0.374</td>\n",
1006
+ " <td>0.140</td>\n",
1007
+ " <td>0.274</td>\n",
1008
+ " <td>...</td>\n",
1009
+ " <td>0.364</td>\n",
1010
+ " <td>0.392</td>\n",
1011
+ " <td>0.529</td>\n",
1012
+ " <td>0.506</td>\n",
1013
+ " <td>NaN</td>\n",
1014
+ " <td>NaN</td>\n",
1015
+ " <td>0.3445</td>\n",
1016
+ " <td>0.3610</td>\n",
1017
+ " <td>0.258927</td>\n",
1018
+ " <td>0.271955</td>\n",
1019
+ " </tr>\n",
1020
+ " <tr>\n",
1021
+ " <th>40</th>\n",
1022
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
1023
+ " <td>6</td>\n",
1024
+ " <td>10000</td>\n",
1025
+ " <td>0.402640</td>\n",
1026
+ " <td>0.321</td>\n",
1027
+ " <td>0.327</td>\n",
1028
+ " <td>0.347</td>\n",
1029
+ " <td>0.383</td>\n",
1030
+ " <td>0.156</td>\n",
1031
+ " <td>0.280</td>\n",
1032
+ " <td>...</td>\n",
1033
+ " <td>0.376</td>\n",
1034
+ " <td>0.397</td>\n",
1035
+ " <td>0.529</td>\n",
1036
+ " <td>0.513</td>\n",
1037
+ " <td>NaN</td>\n",
1038
+ " <td>NaN</td>\n",
1039
+ " <td>0.3445</td>\n",
1040
+ " <td>0.3650</td>\n",
1041
+ " <td>0.258294</td>\n",
1042
+ " <td>0.272123</td>\n",
1043
+ " </tr>\n",
1044
+ " <tr>\n",
1045
+ " <th>41</th>\n",
1046
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
1047
+ " <td>6</td>\n",
1048
+ " <td>11000</td>\n",
1049
+ " <td>0.402599</td>\n",
1050
+ " <td>0.318</td>\n",
1051
+ " <td>0.322</td>\n",
1052
+ " <td>0.348</td>\n",
1053
+ " <td>0.381</td>\n",
1054
+ " <td>0.160</td>\n",
1055
+ " <td>0.284</td>\n",
1056
+ " <td>...</td>\n",
1057
+ " <td>0.367</td>\n",
1058
+ " <td>0.387</td>\n",
1059
+ " <td>0.538</td>\n",
1060
+ " <td>0.516</td>\n",
1061
+ " <td>NaN</td>\n",
1062
+ " <td>NaN</td>\n",
1063
+ " <td>0.3490</td>\n",
1064
+ " <td>0.3660</td>\n",
1065
+ " <td>0.259610</td>\n",
1066
+ " <td>0.276792</td>\n",
1067
+ " </tr>\n",
1068
+ " <tr>\n",
1069
+ " <th>42</th>\n",
1070
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
1071
+ " <td>6</td>\n",
1072
+ " <td>12000</td>\n",
1073
+ " <td>0.407442</td>\n",
1074
+ " <td>0.328</td>\n",
1075
+ " <td>0.319</td>\n",
1076
+ " <td>0.349</td>\n",
1077
+ " <td>0.395</td>\n",
1078
+ " <td>0.162</td>\n",
1079
+ " <td>0.290</td>\n",
1080
+ " <td>...</td>\n",
1081
+ " <td>0.367</td>\n",
1082
+ " <td>0.407</td>\n",
1083
+ " <td>0.528</td>\n",
1084
+ " <td>0.510</td>\n",
1085
+ " <td>NaN</td>\n",
1086
+ " <td>NaN</td>\n",
1087
+ " <td>0.3510</td>\n",
1088
+ " <td>0.3700</td>\n",
1089
+ " <td>0.260350</td>\n",
1090
+ " <td>0.279535</td>\n",
1091
+ " </tr>\n",
1092
+ " <tr>\n",
1093
+ " <th>43</th>\n",
1094
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
1095
+ " <td>6</td>\n",
1096
+ " <td>13000</td>\n",
1097
+ " <td>0.405577</td>\n",
1098
+ " <td>0.324</td>\n",
1099
+ " <td>0.318</td>\n",
1100
+ " <td>0.350</td>\n",
1101
+ " <td>0.385</td>\n",
1102
+ " <td>0.158</td>\n",
1103
+ " <td>0.290</td>\n",
1104
+ " <td>...</td>\n",
1105
+ " <td>0.373</td>\n",
1106
+ " <td>0.396</td>\n",
1107
+ " <td>0.538</td>\n",
1108
+ " <td>0.510</td>\n",
1109
+ " <td>NaN</td>\n",
1110
+ " <td>NaN</td>\n",
1111
+ " <td>0.3540</td>\n",
1112
+ " <td>0.3730</td>\n",
1113
+ " <td>0.258481</td>\n",
1114
+ " <td>0.274616</td>\n",
1115
+ " </tr>\n",
1116
+ " <tr>\n",
1117
+ " <th>44</th>\n",
1118
+ " <td>cross_minhash_dump_CC-MAIN-2013-48</td>\n",
1119
+ " <td>6</td>\n",
1120
+ " <td>13500</td>\n",
1121
+ " <td>0.405000</td>\n",
1122
+ " <td>0.320</td>\n",
1123
+ " <td>0.312</td>\n",
1124
+ " <td>0.354</td>\n",
1125
+ " <td>0.393</td>\n",
1126
+ " <td>0.152</td>\n",
1127
+ " <td>0.288</td>\n",
1128
+ " <td>...</td>\n",
1129
+ " <td>0.367</td>\n",
1130
+ " <td>0.396</td>\n",
1131
+ " <td>0.528</td>\n",
1132
+ " <td>0.513</td>\n",
1133
+ " <td>0.785</td>\n",
1134
+ " <td>0.675</td>\n",
1135
+ " <td>0.3590</td>\n",
1136
+ " <td>0.3660</td>\n",
1137
+ " <td>0.260174</td>\n",
1138
+ " <td>0.278002</td>\n",
1139
+ " </tr>\n",
1140
+ " </tbody>\n",
1141
+ "</table>\n",
1142
+ "<p>45 rows × 22 columns</p>\n",
1143
+ "</div>"
1144
+ ],
1145
+ "text/plain": [
1146
+ " runname seed steps agg_score \\\n",
1147
+ "0 deduped_removed_cross 5 0 0.330893 \n",
1148
+ "1 deduped_removed_cross 5 1000 0.354090 \n",
1149
+ "2 deduped_removed_cross 5 2000 0.373601 \n",
1150
+ "3 deduped_removed_cross 5 3000 0.383122 \n",
1151
+ "4 deduped_removed_cross 5 4000 0.390222 \n",
1152
+ "5 deduped_removed_cross 5 5000 0.400239 \n",
1153
+ "6 deduped_removed_cross 5 6000 0.401484 \n",
1154
+ "7 deduped_removed_cross 5 7000 0.403533 \n",
1155
+ "8 deduped_removed_cross 5 8000 0.411774 \n",
1156
+ "9 deduped_removed_cross 5 9000 0.410993 \n",
1157
+ "10 deduped_removed_cross 5 10000 0.417883 \n",
1158
+ "11 deduped_removed_cross 5 11000 0.422325 \n",
1159
+ "12 deduped_removed_cross 5 12000 0.420167 \n",
1160
+ "13 deduped_removed_cross 5 13000 0.422913 \n",
1161
+ "14 deduped_removed_cross 5 13500 0.421868 \n",
1162
+ "15 deduped_removed_cross 6 0 0.330893 \n",
1163
+ "16 deduped_removed_cross 6 1000 0.360039 \n",
1164
+ "17 deduped_removed_cross 6 2000 0.371564 \n",
1165
+ "18 deduped_removed_cross 6 3000 0.383770 \n",
1166
+ "19 deduped_removed_cross 6 4000 0.391082 \n",
1167
+ "20 deduped_removed_cross 6 5000 0.399130 \n",
1168
+ "21 deduped_removed_cross 6 6000 0.402792 \n",
1169
+ "22 deduped_removed_cross 6 7000 0.408846 \n",
1170
+ "23 deduped_removed_cross 6 8000 0.411429 \n",
1171
+ "24 deduped_removed_cross 6 9000 0.417279 \n",
1172
+ "25 deduped_removed_cross 6 10000 0.421399 \n",
1173
+ "26 deduped_removed_cross 6 11000 0.421204 \n",
1174
+ "27 deduped_removed_cross 6 12000 0.421667 \n",
1175
+ "28 deduped_removed_cross 6 13000 0.424979 \n",
1176
+ "29 deduped_removed_cross 6 13500 0.425356 \n",
1177
+ "30 cross_minhash_dump_CC-MAIN-2013-48 6 0 0.331018 \n",
1178
+ "31 cross_minhash_dump_CC-MAIN-2013-48 6 1000 0.349494 \n",
1179
+ "32 cross_minhash_dump_CC-MAIN-2013-48 6 2000 0.367893 \n",
1180
+ "33 cross_minhash_dump_CC-MAIN-2013-48 6 3000 0.379114 \n",
1181
+ "34 cross_minhash_dump_CC-MAIN-2013-48 6 4000 0.383025 \n",
1182
+ "35 cross_minhash_dump_CC-MAIN-2013-48 6 5000 0.387223 \n",
1183
+ "36 cross_minhash_dump_CC-MAIN-2013-48 6 6000 0.394011 \n",
1184
+ "37 cross_minhash_dump_CC-MAIN-2013-48 6 7000 0.398090 \n",
1185
+ "38 cross_minhash_dump_CC-MAIN-2013-48 6 8000 0.398513 \n",
1186
+ "39 cross_minhash_dump_CC-MAIN-2013-48 6 9000 0.397494 \n",
1187
+ "40 cross_minhash_dump_CC-MAIN-2013-48 6 10000 0.402640 \n",
1188
+ "41 cross_minhash_dump_CC-MAIN-2013-48 6 11000 0.402599 \n",
1189
+ "42 cross_minhash_dump_CC-MAIN-2013-48 6 12000 0.407442 \n",
1190
+ "43 cross_minhash_dump_CC-MAIN-2013-48 6 13000 0.405577 \n",
1191
+ "44 cross_minhash_dump_CC-MAIN-2013-48 6 13500 0.405000 \n",
1192
+ "\n",
1193
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
1194
+ "0 0.186 0.233 0.272 \n",
1195
+ "1 0.253 0.257 0.290 \n",
1196
+ "2 0.274 0.290 0.313 \n",
1197
+ "3 0.306 0.292 0.323 \n",
1198
+ "4 0.300 0.292 0.324 \n",
1199
+ "5 0.322 0.308 0.325 \n",
1200
+ "6 0.315 0.314 0.341 \n",
1201
+ "7 0.324 0.315 0.350 \n",
1202
+ "8 0.344 0.313 0.352 \n",
1203
+ "9 0.335 0.322 0.361 \n",
1204
+ "10 0.330 0.320 0.370 \n",
1205
+ "11 0.332 0.328 0.366 \n",
1206
+ "12 0.348 0.324 0.364 \n",
1207
+ "13 0.346 0.330 0.372 \n",
1208
+ "14 0.345 0.322 0.370 \n",
1209
+ "15 0.186 0.233 0.272 \n",
1210
+ "16 0.236 0.259 0.283 \n",
1211
+ "17 0.270 0.283 0.303 \n",
1212
+ "18 0.283 0.286 0.323 \n",
1213
+ "19 0.293 0.298 0.339 \n",
1214
+ "20 0.309 0.311 0.343 \n",
1215
+ "21 0.326 0.318 0.353 \n",
1216
+ "22 0.319 0.319 0.356 \n",
1217
+ "23 0.314 0.323 0.361 \n",
1218
+ "24 0.337 0.329 0.367 \n",
1219
+ "25 0.339 0.322 0.376 \n",
1220
+ "26 0.349 0.337 0.378 \n",
1221
+ "27 0.342 0.326 0.383 \n",
1222
+ "28 0.349 0.336 0.383 \n",
1223
+ "29 0.347 0.333 0.386 \n",
1224
+ "30 0.186 0.233 0.272 \n",
1225
+ "31 0.217 0.248 0.288 \n",
1226
+ "32 0.245 0.280 0.298 \n",
1227
+ "33 0.269 0.291 0.304 \n",
1228
+ "34 0.277 0.289 0.311 \n",
1229
+ "35 0.290 0.306 0.327 \n",
1230
+ "36 0.303 0.305 0.332 \n",
1231
+ "37 0.316 0.305 0.337 \n",
1232
+ "38 0.326 0.315 0.339 \n",
1233
+ "39 0.310 0.314 0.345 \n",
1234
+ "40 0.321 0.327 0.347 \n",
1235
+ "41 0.318 0.322 0.348 \n",
1236
+ "42 0.328 0.319 0.349 \n",
1237
+ "43 0.324 0.318 0.350 \n",
1238
+ "44 0.320 0.312 0.354 \n",
1239
+ "\n",
1240
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
1241
+ "0 0.258 0.166 0.286 ... 0.367 \n",
1242
+ "1 0.278 0.124 0.264 ... 0.368 \n",
1243
+ "2 0.312 0.116 0.258 ... 0.367 \n",
1244
+ "3 0.335 0.150 0.278 ... 0.371 \n",
1245
+ "4 0.351 0.144 0.278 ... 0.386 \n",
1246
+ "5 0.364 0.172 0.298 ... 0.382 \n",
1247
+ "6 0.372 0.162 0.314 ... 0.377 \n",
1248
+ "7 0.386 0.188 0.298 ... 0.376 \n",
1249
+ "8 0.409 0.170 0.310 ... 0.374 \n",
1250
+ "9 0.404 0.182 0.294 ... 0.374 \n",
1251
+ "10 0.417 0.192 0.324 ... 0.389 \n",
1252
+ "11 0.426 0.188 0.320 ... 0.398 \n",
1253
+ "12 0.434 0.194 0.306 ... 0.377 \n",
1254
+ "13 0.438 0.190 0.320 ... 0.392 \n",
1255
+ "14 0.431 0.202 0.330 ... 0.387 \n",
1256
+ "15 0.258 0.166 0.286 ... 0.367 \n",
1257
+ "16 0.277 0.130 0.274 ... 0.354 \n",
1258
+ "17 0.305 0.132 0.280 ... 0.377 \n",
1259
+ "18 0.320 0.156 0.296 ... 0.375 \n",
1260
+ "19 0.361 0.160 0.292 ... 0.380 \n",
1261
+ "20 0.376 0.160 0.286 ... 0.392 \n",
1262
+ "21 0.387 0.176 0.284 ... 0.376 \n",
1263
+ "22 0.407 0.172 0.300 ... 0.386 \n",
1264
+ "23 0.412 0.168 0.286 ... 0.395 \n",
1265
+ "24 0.421 0.176 0.294 ... 0.407 \n",
1266
+ "25 0.426 0.174 0.320 ... 0.397 \n",
1267
+ "26 0.428 0.188 0.314 ... 0.403 \n",
1268
+ "27 0.434 0.174 0.310 ... 0.399 \n",
1269
+ "28 0.440 0.178 0.314 ... 0.401 \n",
1270
+ "29 0.444 0.186 0.322 ... 0.406 \n",
1271
+ "30 0.258 0.166 0.286 ... 0.367 \n",
1272
+ "31 0.286 0.104 0.244 ... 0.366 \n",
1273
+ "32 0.288 0.128 0.280 ... 0.366 \n",
1274
+ "33 0.328 0.138 0.266 ... 0.362 \n",
1275
+ "34 0.338 0.132 0.280 ... 0.361 \n",
1276
+ "35 0.356 0.138 0.276 ... 0.365 \n",
1277
+ "36 0.356 0.142 0.288 ... 0.375 \n",
1278
+ "37 0.359 0.142 0.302 ... 0.372 \n",
1279
+ "38 0.372 0.150 0.288 ... 0.372 \n",
1280
+ "39 0.374 0.140 0.274 ... 0.364 \n",
1281
+ "40 0.383 0.156 0.280 ... 0.376 \n",
1282
+ "41 0.381 0.160 0.284 ... 0.367 \n",
1283
+ "42 0.395 0.162 0.290 ... 0.367 \n",
1284
+ "43 0.385 0.158 0.290 ... 0.373 \n",
1285
+ "44 0.393 0.152 0.288 ... 0.367 \n",
1286
+ "\n",
1287
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
1288
+ "0 0.362 0.516 0.497 0.208 \n",
1289
+ "1 0.389 0.509 0.491 0.582 \n",
1290
+ "2 0.397 0.516 0.505 0.686 \n",
1291
+ "3 0.401 0.513 0.500 0.712 \n",
1292
+ "4 0.395 0.511 0.511 0.750 \n",
1293
+ "5 0.398 0.518 0.522 0.751 \n",
1294
+ "6 0.390 0.498 0.492 0.776 \n",
1295
+ "7 0.384 0.518 0.521 0.769 \n",
1296
+ "8 0.391 0.530 0.521 0.781 \n",
1297
+ "9 0.391 0.526 0.514 0.769 \n",
1298
+ "10 0.389 0.518 0.524 0.785 \n",
1299
+ "11 0.397 0.535 0.529 0.801 \n",
1300
+ "12 0.392 0.541 0.527 0.790 \n",
1301
+ "13 0.396 0.540 0.522 0.802 \n",
1302
+ "14 0.392 0.540 0.516 0.797 \n",
1303
+ "15 0.362 0.516 0.497 0.208 \n",
1304
+ "16 0.386 0.509 0.507 0.559 \n",
1305
+ "17 0.392 0.522 0.504 0.665 \n",
1306
+ "18 0.394 0.503 0.497 0.721 \n",
1307
+ "19 0.399 0.505 0.494 0.719 \n",
1308
+ "20 0.401 0.525 0.512 0.733 \n",
1309
+ "21 0.405 0.522 0.514 0.753 \n",
1310
+ "22 0.399 0.521 0.521 0.764 \n",
1311
+ "23 0.404 0.533 0.511 0.754 \n",
1312
+ "24 0.403 0.532 0.526 0.775 \n",
1313
+ "25 0.401 0.542 0.532 0.764 \n",
1314
+ "26 0.398 0.530 0.516 NaN \n",
1315
+ "27 0.396 0.538 0.525 NaN \n",
1316
+ "28 0.392 0.535 0.526 NaN \n",
1317
+ "29 0.392 0.543 0.527 0.783 \n",
1318
+ "30 0.362 0.515 0.497 NaN \n",
1319
+ "31 0.380 0.499 0.492 0.546 \n",
1320
+ "32 0.383 0.519 0.499 NaN \n",
1321
+ "33 0.394 0.519 0.504 NaN \n",
1322
+ "34 0.393 0.502 0.496 NaN \n",
1323
+ "35 0.389 0.515 0.511 NaN \n",
1324
+ "36 0.397 0.540 0.521 NaN \n",
1325
+ "37 0.401 0.531 0.510 NaN \n",
1326
+ "38 0.396 0.532 0.508 NaN \n",
1327
+ "39 0.392 0.529 0.506 NaN \n",
1328
+ "40 0.397 0.529 0.513 NaN \n",
1329
+ "41 0.387 0.538 0.516 NaN \n",
1330
+ "42 0.407 0.528 0.510 NaN \n",
1331
+ "43 0.396 0.538 0.510 NaN \n",
1332
+ "44 0.396 0.528 0.513 0.785 \n",
1333
+ "\n",
1334
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
1335
+ "0 0.202 0.2195 0.2510 0.230294 0.250147 \n",
1336
+ "1 0.516 0.2825 0.2955 0.239520 0.253223 \n",
1337
+ "2 0.582 0.3090 0.3200 0.247320 0.262812 \n",
1338
+ "3 0.611 0.3075 0.3415 0.248568 0.263474 \n",
1339
+ "4 0.658 0.3260 0.3445 0.259246 0.273276 \n",
1340
+ "5 0.661 0.3470 0.3545 0.258485 0.271414 \n",
1341
+ "6 0.669 0.3530 0.3565 0.261842 0.276371 \n",
1342
+ "7 0.672 0.3625 0.3585 0.265558 0.274768 \n",
1343
+ "8 0.677 0.3530 0.3615 0.267141 0.283691 \n",
1344
+ "9 0.672 0.3630 0.3715 0.266464 0.284446 \n",
1345
+ "10 0.682 0.3735 0.3745 0.268085 0.283562 \n",
1346
+ "11 0.695 0.3775 0.3800 0.267457 0.285596 \n",
1347
+ "12 0.690 0.3680 0.3755 0.267547 0.285836 \n",
1348
+ "13 0.707 0.3760 0.3845 0.271108 0.287802 \n",
1349
+ "14 0.700 0.3790 0.3870 0.269510 0.287944 \n",
1350
+ "15 0.202 0.2195 0.2510 0.230294 0.250147 \n",
1351
+ "16 0.500 0.2590 0.2970 0.243455 0.254311 \n",
1352
+ "17 0.566 0.3040 0.3135 0.249051 0.255010 \n",
1353
+ "18 0.626 0.3140 0.3410 0.254015 0.266158 \n",
1354
+ "19 0.615 0.3375 0.3375 0.256696 0.268152 \n",
1355
+ "20 0.639 0.3390 0.3580 0.257450 0.271040 \n",
1356
+ "21 0.664 0.3450 0.3645 0.262549 0.273836 \n",
1357
+ "22 0.662 0.3585 0.3625 0.262740 0.276266 \n",
1358
+ "23 0.646 0.3555 0.3690 0.263875 0.278433 \n",
1359
+ "24 0.666 0.3605 0.3730 0.265119 0.283235 \n",
1360
+ "25 0.673 0.3675 0.3840 0.272474 0.286190 \n",
1361
+ "26 NaN 0.3690 0.3780 0.269131 0.288633 \n",
1362
+ "27 NaN 0.3660 0.3810 0.270691 0.287333 \n",
1363
+ "28 NaN 0.3785 0.3905 0.268910 0.289335 \n",
1364
+ "29 0.682 0.3745 0.3890 0.270869 0.289845 \n",
1365
+ "30 NaN 0.2195 0.2520 0.230228 0.250147 \n",
1366
+ "31 0.484 0.2565 0.2780 0.239651 0.253956 \n",
1367
+ "32 NaN 0.2845 0.3115 0.239715 0.253644 \n",
1368
+ "33 NaN 0.3035 0.3335 0.250551 0.262409 \n",
1369
+ "34 NaN 0.3105 0.3375 0.249887 0.263702 \n",
1370
+ "35 NaN 0.3190 0.3380 0.252621 0.266785 \n",
1371
+ "36 NaN 0.3280 0.3515 0.252255 0.265589 \n",
1372
+ "37 NaN 0.3320 0.3550 0.250146 0.267719 \n",
1373
+ "38 NaN 0.3365 0.3630 0.258433 0.274100 \n",
1374
+ "39 NaN 0.3445 0.3610 0.258927 0.271955 \n",
1375
+ "40 NaN 0.3445 0.3650 0.258294 0.272123 \n",
1376
+ "41 NaN 0.3490 0.3660 0.259610 0.276792 \n",
1377
+ "42 NaN 0.3510 0.3700 0.260350 0.279535 \n",
1378
+ "43 NaN 0.3540 0.3730 0.258481 0.274616 \n",
1379
+ "44 0.675 0.3590 0.3660 0.260174 0.278002 \n",
1380
+ "\n",
1381
+ "[45 rows x 22 columns]"
1382
+ ]
1383
+ },
1384
+ "execution_count": 23,
1385
+ "metadata": {},
1386
+ "output_type": "execute_result"
1387
+ }
1388
+ ],
1389
+ "source": [
1390
+ "import pandas as pd\n",
1391
+ "from matplotlib.figure import Figure\n",
1392
+ "\n",
1393
+ "df = pd.read_csv(\"../src_data/removed_data_cross_dedup.csv\")\n",
1394
+ "df"
1395
+ ]
1396
+ },
1397
+ {
1398
+ "cell_type": "code",
1399
+ "execution_count": 24,
1400
+ "id": "b610f43caefdf01",
1401
+ "metadata": {
1402
+ "ExecuteTime": {
1403
+ "end_time": "2024-04-30T13:29:05.776714Z",
1404
+ "start_time": "2024-04-30T13:29:05.774546Z"
1405
+ },
1406
+ "collapsed": false
1407
+ },
1408
+ "outputs": [],
1409
+ "source": [
1410
+ "runs_mapping = {\n",
1411
+ " \"deduped_removed_cross\": \"Originally removed data\",\n",
1412
+ " \"cross_minhash_dump_CC-MAIN-2013-48\": \"Originally kept data\",\n",
1413
+ "}"
1414
+ ]
1415
+ },
1416
+ {
1417
+ "cell_type": "code",
1418
+ "execution_count": 25,
1419
+ "id": "18b2dde6",
1420
+ "metadata": {},
1421
+ "outputs": [
1422
+ {
1423
+ "data": {
1424
+ "text/plain": [
1425
+ "Index(['runname', 'seed', 'steps', 'agg_score', 'commonsense_qa/acc',\n",
1426
+ " 'commonsense_qa/acc_norm', 'hellaswag/acc', 'hellaswag/acc_norm',\n",
1427
+ " 'openbookqa/acc', 'openbookqa/acc_norm', 'piqa/acc', 'piqa/acc_norm',\n",
1428
+ " 'siqa/acc', 'siqa/acc_norm', 'winogrande/acc', 'winogrande/acc_norm',\n",
1429
+ " 'sciq/acc', 'sciq/acc_norm', 'arc/acc', 'arc/acc_norm', 'mmlu/acc',\n",
1430
+ " 'mmlu/acc_norm'],\n",
1431
+ " dtype='object')"
1432
+ ]
1433
+ },
1434
+ "execution_count": 25,
1435
+ "metadata": {},
1436
+ "output_type": "execute_result"
1437
+ }
1438
+ ],
1439
+ "source": [
1440
+ "df.columns"
1441
+ ]
1442
+ },
1443
+ {
1444
+ "cell_type": "code",
1445
+ "execution_count": 27,
1446
+ "id": "initial_id",
1447
+ "metadata": {
1448
+ "ExecuteTime": {
1449
+ "end_time": "2024-04-30T13:31:10.740797Z",
1450
+ "start_time": "2024-04-30T13:31:10.661359Z"
1451
+ },
1452
+ "collapsed": true
1453
+ },
1454
+ "outputs": [
1455
+ {
1456
+ "name": "stderr",
1457
+ "output_type": "stream",
1458
+ "text": [
1459
+ "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
1460
+ ]
1461
+ },
1462
+ {
1463
+ "data": {
1464
+ "image/png": "",
1465
+ "text/plain": [
1466
+ "<Figure size 640x480 with 1 Axes>"
1467
+ ]
1468
+ },
1469
+ "metadata": {},
1470
+ "output_type": "display_data"
1471
+ }
1472
+ ],
1473
+ "source": [
1474
+ "import json\n",
1475
+ "import os\n",
1476
+ "from matplotlib import pyplot as plt\n",
1477
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
1478
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
1479
+ "\n",
1480
+ "def normalize_runname(runname):\n",
1481
+ " return runname.replace(\"/\", \"_\")\n",
1482
+ "\n",
1483
+ "grouped = (\n",
1484
+ " df.groupby([\"runname\", \"steps\"])\n",
1485
+ " .agg(\n",
1486
+ " {\n",
1487
+ " key: \"mean\" for key in metrics\n",
1488
+ " }\n",
1489
+ " )\n",
1490
+ " .reset_index()\n",
1491
+ ")\n",
1492
+ "\n",
1493
+ "file_id=\"../assets/data/plots/removed_data_dedup\"\n",
1494
+ "files = {}\n",
1495
+ "for metric in metrics:\n",
1496
+ " datas = {}\n",
1497
+ " for name, group in grouped.groupby(\"runname\"):\n",
1498
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
1499
+ " group = group.set_index(\"steps\")\n",
1500
+ " rolling_avg = group\n",
1501
+ " # rolling_avg = group.rolling(window=5).mean()\n",
1502
+ " datas[name] = {\n",
1503
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
1504
+ " \"y\": rolling_avg[metric].tolist(),\n",
1505
+ " \"label\": runs_mapping[name],\n",
1506
+ " }\n",
1507
+ " # Order the runs by their final metric value, highest first\n",
1508
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
1509
+ " # Create a folder\n",
1510
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
1511
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
1512
+ " json.dump({\n",
1513
+ " \"data\": datas,\n",
1514
+ " \"layout\": {\n",
1515
+ " \"title\": {\n",
1516
+ " \"text\": \"The originally removed data outperforms the kept data\"\n",
1517
+ " },\n",
1518
+ " }\n",
1519
+ " }, f)\n",
1520
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
1521
+ "# Create index\n",
1522
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
1523
+ " json.dump({\n",
1524
+ " \"files\": files,\n",
1525
+ " \"settings\": {\n",
1526
+ " \"defaultMetric\": \"agg_score\",\n",
1527
+ " \"slider\":{\"min\":0,\"max\":10,\"default\":0}\n",
1528
+ " }\n",
1529
+ " }, f)\n",
1530
+ " \n",
1531
+ "\n",
1532
+ "# Add labels and legend\n",
1533
+ "plt.xlabel(\"Training tokens (billions)\")\n",
1534
+ "plt.ylabel(\"Agg Score\")\n",
1535
+ "plt.title(\"The originally removed data outperforms the kept data\")\n",
1536
+ "plt.legend()\n",
1537
+ "\n",
1538
+ "# Show the plot\n",
1539
+ "plt.show()"
1540
+ ]
1541
+ },
1542
+ {
1543
+ "cell_type": "code",
1544
+ "execution_count": 3,
1545
+ "id": "af28ebbd054cdc33",
1546
+ "metadata": {
1547
+ "ExecuteTime": {
1548
+ "end_time": "2024-04-30T12:52:05.836260Z",
1549
+ "start_time": "2024-04-30T12:52:05.834381Z"
1550
+ },
1551
+ "collapsed": false
1552
+ },
1553
+ "outputs": [],
1554
+ "source": []
1555
+ }
1556
+ ],
1557
+ "metadata": {
1558
+ "kernelspec": {
1559
+ "display_name": "Python 3",
1560
+ "language": "python",
1561
+ "name": "python3"
1562
+ },
1563
+ "language_info": {
1564
+ "codemirror_mode": {
1565
+ "name": "ipython",
1566
+ "version": 3
1567
+ },
1568
+ "file_extension": ".py",
1569
+ "mimetype": "text/x-python",
1570
+ "name": "python",
1571
+ "nbconvert_exporter": "python",
1572
+ "pygments_lexer": "ipython3",
1573
+ "version": "3.12.2"
1574
+ }
1575
+ },
1576
+ "nbformat": 4,
1577
+ "nbformat_minor": 5
1578
+ }
notebooks/plot_wet_comparison.ipynb ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 6,
6
+ "id": "138889b92720ce2e",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2024-05-13T15:30:52.864251Z",
10
+ "start_time": "2024-05-13T15:30:52.316016Z"
11
+ },
12
+ "collapsed": false
13
+ },
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/html": [
18
+ "<div>\n",
19
+ "<style scoped>\n",
20
+ " .dataframe tbody tr th:only-of-type {\n",
21
+ " vertical-align: middle;\n",
22
+ " }\n",
23
+ "\n",
24
+ " .dataframe tbody tr th {\n",
25
+ " vertical-align: top;\n",
26
+ " }\n",
27
+ "\n",
28
+ " .dataframe thead th {\n",
29
+ " text-align: right;\n",
30
+ " }\n",
31
+ "</style>\n",
32
+ "<table border=\"1\" class=\"dataframe\">\n",
33
+ " <thead>\n",
34
+ " <tr style=\"text-align: right;\">\n",
35
+ " <th></th>\n",
36
+ " <th>runname</th>\n",
37
+ " <th>seed</th>\n",
38
+ " <th>steps</th>\n",
39
+ " <th>agg_score</th>\n",
40
+ " <th>commonsense_qa/acc</th>\n",
41
+ " <th>commonsense_qa/acc_norm</th>\n",
42
+ " <th>hellaswag/acc</th>\n",
43
+ " <th>hellaswag/acc_norm</th>\n",
44
+ " <th>openbookqa/acc</th>\n",
45
+ " <th>openbookqa/acc_norm</th>\n",
46
+ " <th>...</th>\n",
47
+ " <th>siqa/acc</th>\n",
48
+ " <th>siqa/acc_norm</th>\n",
49
+ " <th>winogrande/acc</th>\n",
50
+ " <th>winogrande/acc_norm</th>\n",
51
+ " <th>sciq/acc</th>\n",
52
+ " <th>sciq/acc_norm</th>\n",
53
+ " <th>arc/acc</th>\n",
54
+ " <th>arc/acc_norm</th>\n",
55
+ " <th>mmlu/acc</th>\n",
56
+ " <th>mmlu/acc_norm</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
63
+ " <td>5</td>\n",
64
+ " <td>0</td>\n",
65
+ " <td>0.330953</td>\n",
66
+ " <td>0.186</td>\n",
67
+ " <td>0.233</td>\n",
68
+ " <td>0.272</td>\n",
69
+ " <td>0.258</td>\n",
70
+ " <td>0.166</td>\n",
71
+ " <td>0.286</td>\n",
72
+ " <td>...</td>\n",
73
+ " <td>0.367</td>\n",
74
+ " <td>0.362</td>\n",
75
+ " <td>0.516</td>\n",
76
+ " <td>0.497</td>\n",
77
+ " <td>0.210</td>\n",
78
+ " <td>0.202</td>\n",
79
+ " <td>0.2190</td>\n",
80
+ " <td>0.2515</td>\n",
81
+ " <td>0.230285</td>\n",
82
+ " <td>0.250127</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>1</th>\n",
86
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
87
+ " <td>5</td>\n",
88
+ " <td>1000</td>\n",
89
+ " <td>0.357474</td>\n",
90
+ " <td>0.239</td>\n",
91
+ " <td>0.271</td>\n",
92
+ " <td>0.297</td>\n",
93
+ " <td>0.287</td>\n",
94
+ " <td>0.146</td>\n",
95
+ " <td>0.260</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>0.365</td>\n",
98
+ " <td>0.396</td>\n",
99
+ " <td>0.503</td>\n",
100
+ " <td>0.486</td>\n",
101
+ " <td>0.568</td>\n",
102
+ " <td>0.502</td>\n",
103
+ " <td>0.2665</td>\n",
104
+ " <td>0.2855</td>\n",
105
+ " <td>0.242526</td>\n",
106
+ " <td>0.253291</td>\n",
107
+ " </tr>\n",
108
+ " <tr>\n",
109
+ " <th>2</th>\n",
110
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
111
+ " <td>5</td>\n",
112
+ " <td>2000</td>\n",
113
+ " <td>0.377436</td>\n",
114
+ " <td>0.280</td>\n",
115
+ " <td>0.284</td>\n",
116
+ " <td>0.321</td>\n",
117
+ " <td>0.332</td>\n",
118
+ " <td>0.134</td>\n",
119
+ " <td>0.268</td>\n",
120
+ " <td>...</td>\n",
121
+ " <td>0.368</td>\n",
122
+ " <td>0.399</td>\n",
123
+ " <td>0.519</td>\n",
124
+ " <td>0.502</td>\n",
125
+ " <td>0.686</td>\n",
126
+ " <td>0.590</td>\n",
127
+ " <td>0.3030</td>\n",
128
+ " <td>0.3215</td>\n",
129
+ " <td>0.245745</td>\n",
130
+ " <td>0.260988</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>3</th>\n",
134
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
135
+ " <td>5</td>\n",
136
+ " <td>3000</td>\n",
137
+ " <td>0.387994</td>\n",
138
+ " <td>0.277</td>\n",
139
+ " <td>0.291</td>\n",
140
+ " <td>0.339</td>\n",
141
+ " <td>0.359</td>\n",
142
+ " <td>0.132</td>\n",
143
+ " <td>0.280</td>\n",
144
+ " <td>...</td>\n",
145
+ " <td>0.394</td>\n",
146
+ " <td>0.404</td>\n",
147
+ " <td>0.520</td>\n",
148
+ " <td>0.503</td>\n",
149
+ " <td>0.721</td>\n",
150
+ " <td>0.622</td>\n",
151
+ " <td>0.3210</td>\n",
152
+ " <td>0.3385</td>\n",
153
+ " <td>0.250427</td>\n",
154
+ " <td>0.264451</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>4</th>\n",
158
+ " <td>filtering-baseline-2019-18-40gt</td>\n",
159
+ " <td>5</td>\n",
160
+ " <td>4000</td>\n",
161
+ " <td>0.396110</td>\n",
162
+ " <td>0.299</td>\n",
163
+ " <td>0.315</td>\n",
164
+ " <td>0.340</td>\n",
165
+ " <td>0.366</td>\n",
166
+ " <td>0.158</td>\n",
167
+ " <td>0.286</td>\n",
168
+ " <td>...</td>\n",
169
+ " <td>0.376</td>\n",
170
+ " <td>0.399</td>\n",
171
+ " <td>0.515</td>\n",
172
+ " <td>0.500</td>\n",
173
+ " <td>0.739</td>\n",
174
+ " <td>0.620</td>\n",
175
+ " <td>0.3320</td>\n",
176
+ " <td>0.3445</td>\n",
177
+ " <td>0.256134</td>\n",
178
+ " <td>0.270382</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>...</th>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " <td>...</td>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " <td>...</td>\n",
194
+ " <td>...</td>\n",
195
+ " <td>...</td>\n",
196
+ " <td>...</td>\n",
197
+ " <td>...</td>\n",
198
+ " <td>...</td>\n",
199
+ " <td>...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " </tr>\n",
204
+ " <tr>\n",
205
+ " <th>115</th>\n",
206
+ " <td>wet-extraction-2019-18</td>\n",
207
+ " <td>6</td>\n",
208
+ " <td>10000</td>\n",
209
+ " <td>0.408977</td>\n",
210
+ " <td>0.326</td>\n",
211
+ " <td>0.312</td>\n",
212
+ " <td>0.362</td>\n",
213
+ " <td>0.412</td>\n",
214
+ " <td>0.166</td>\n",
215
+ " <td>0.312</td>\n",
216
+ " <td>...</td>\n",
217
+ " <td>0.379</td>\n",
218
+ " <td>0.396</td>\n",
219
+ " <td>0.525</td>\n",
220
+ " <td>0.517</td>\n",
221
+ " <td>0.767</td>\n",
222
+ " <td>0.654</td>\n",
223
+ " <td>0.3480</td>\n",
224
+ " <td>0.3560</td>\n",
225
+ " <td>0.262357</td>\n",
226
+ " <td>0.276813</td>\n",
227
+ " </tr>\n",
228
+ " <tr>\n",
229
+ " <th>116</th>\n",
230
+ " <td>wet-extraction-2019-18</td>\n",
231
+ " <td>6</td>\n",
232
+ " <td>11000</td>\n",
233
+ " <td>0.408771</td>\n",
234
+ " <td>0.325</td>\n",
235
+ " <td>0.315</td>\n",
236
+ " <td>0.363</td>\n",
237
+ " <td>0.409</td>\n",
238
+ " <td>0.162</td>\n",
239
+ " <td>0.312</td>\n",
240
+ " <td>...</td>\n",
241
+ " <td>0.388</td>\n",
242
+ " <td>0.399</td>\n",
243
+ " <td>0.529</td>\n",
244
+ " <td>0.520</td>\n",
245
+ " <td>0.777</td>\n",
246
+ " <td>0.664</td>\n",
247
+ " <td>0.3465</td>\n",
248
+ " <td>0.3555</td>\n",
249
+ " <td>0.261599</td>\n",
250
+ " <td>0.276664</td>\n",
251
+ " </tr>\n",
252
+ " <tr>\n",
253
+ " <th>117</th>\n",
254
+ " <td>wet-extraction-2019-18</td>\n",
255
+ " <td>6</td>\n",
256
+ " <td>12000</td>\n",
257
+ " <td>0.408239</td>\n",
258
+ " <td>0.329</td>\n",
259
+ " <td>0.308</td>\n",
260
+ " <td>0.364</td>\n",
261
+ " <td>0.416</td>\n",
262
+ " <td>0.178</td>\n",
263
+ " <td>0.308</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>0.382</td>\n",
266
+ " <td>0.398</td>\n",
267
+ " <td>0.521</td>\n",
268
+ " <td>0.510</td>\n",
269
+ " <td>0.770</td>\n",
270
+ " <td>0.656</td>\n",
271
+ " <td>0.3555</td>\n",
272
+ " <td>0.3595</td>\n",
273
+ " <td>0.260928</td>\n",
274
+ " <td>0.278411</td>\n",
275
+ " </tr>\n",
276
+ " <tr>\n",
277
+ " <th>118</th>\n",
278
+ " <td>wet-extraction-2019-18</td>\n",
279
+ " <td>6</td>\n",
280
+ " <td>13000</td>\n",
281
+ " <td>0.413263</td>\n",
282
+ " <td>0.325</td>\n",
283
+ " <td>0.308</td>\n",
284
+ " <td>0.367</td>\n",
285
+ " <td>0.425</td>\n",
286
+ " <td>0.174</td>\n",
287
+ " <td>0.312</td>\n",
288
+ " <td>...</td>\n",
289
+ " <td>0.387</td>\n",
290
+ " <td>0.411</td>\n",
291
+ " <td>0.523</td>\n",
292
+ " <td>0.524</td>\n",
293
+ " <td>0.774</td>\n",
294
+ " <td>0.662</td>\n",
295
+ " <td>0.3570</td>\n",
296
+ " <td>0.3600</td>\n",
297
+ " <td>0.263067</td>\n",
298
+ " <td>0.281104</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>119</th>\n",
302
+ " <td>wet-extraction-2019-18</td>\n",
303
+ " <td>6</td>\n",
304
+ " <td>13500</td>\n",
305
+ " <td>0.410754</td>\n",
306
+ " <td>0.335</td>\n",
307
+ " <td>0.310</td>\n",
308
+ " <td>0.366</td>\n",
309
+ " <td>0.424</td>\n",
310
+ " <td>0.164</td>\n",
311
+ " <td>0.300</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>0.392</td>\n",
314
+ " <td>0.407</td>\n",
315
+ " <td>0.515</td>\n",
316
+ " <td>0.519</td>\n",
317
+ " <td>0.779</td>\n",
318
+ " <td>0.668</td>\n",
319
+ " <td>0.3590</td>\n",
320
+ " <td>0.3565</td>\n",
321
+ " <td>0.261681</td>\n",
322
+ " <td>0.279534</td>\n",
323
+ " </tr>\n",
324
+ " </tbody>\n",
325
+ "</table>\n",
326
+ "<p>120 rows × 22 columns</p>\n",
327
+ "</div>"
328
+ ],
329
+ "text/plain": [
330
+ " runname seed steps agg_score \\\n",
331
+ "0 filtering-baseline-2019-18-40gt 5 0 0.330953 \n",
332
+ "1 filtering-baseline-2019-18-40gt 5 1000 0.357474 \n",
333
+ "2 filtering-baseline-2019-18-40gt 5 2000 0.377436 \n",
334
+ "3 filtering-baseline-2019-18-40gt 5 3000 0.387994 \n",
335
+ "4 filtering-baseline-2019-18-40gt 5 4000 0.396110 \n",
336
+ ".. ... ... ... ... \n",
337
+ "115 wet-extraction-2019-18 6 10000 0.408977 \n",
338
+ "116 wet-extraction-2019-18 6 11000 0.408771 \n",
339
+ "117 wet-extraction-2019-18 6 12000 0.408239 \n",
340
+ "118 wet-extraction-2019-18 6 13000 0.413263 \n",
341
+ "119 wet-extraction-2019-18 6 13500 0.410754 \n",
342
+ "\n",
343
+ " commonsense_qa/acc commonsense_qa/acc_norm hellaswag/acc \\\n",
344
+ "0 0.186 0.233 0.272 \n",
345
+ "1 0.239 0.271 0.297 \n",
346
+ "2 0.280 0.284 0.321 \n",
347
+ "3 0.277 0.291 0.339 \n",
348
+ "4 0.299 0.315 0.340 \n",
349
+ ".. ... ... ... \n",
350
+ "115 0.326 0.312 0.362 \n",
351
+ "116 0.325 0.315 0.363 \n",
352
+ "117 0.329 0.308 0.364 \n",
353
+ "118 0.325 0.308 0.367 \n",
354
+ "119 0.335 0.310 0.366 \n",
355
+ "\n",
356
+ " hellaswag/acc_norm openbookqa/acc openbookqa/acc_norm ... siqa/acc \\\n",
357
+ "0 0.258 0.166 0.286 ... 0.367 \n",
358
+ "1 0.287 0.146 0.260 ... 0.365 \n",
359
+ "2 0.332 0.134 0.268 ... 0.368 \n",
360
+ "3 0.359 0.132 0.280 ... 0.394 \n",
361
+ "4 0.366 0.158 0.286 ... 0.376 \n",
362
+ ".. ... ... ... ... ... \n",
363
+ "115 0.412 0.166 0.312 ... 0.379 \n",
364
+ "116 0.409 0.162 0.312 ... 0.388 \n",
365
+ "117 0.416 0.178 0.308 ... 0.382 \n",
366
+ "118 0.425 0.174 0.312 ... 0.387 \n",
367
+ "119 0.424 0.164 0.300 ... 0.392 \n",
368
+ "\n",
369
+ " siqa/acc_norm winogrande/acc winogrande/acc_norm sciq/acc \\\n",
370
+ "0 0.362 0.516 0.497 0.210 \n",
371
+ "1 0.396 0.503 0.486 0.568 \n",
372
+ "2 0.399 0.519 0.502 0.686 \n",
373
+ "3 0.404 0.520 0.503 0.721 \n",
374
+ "4 0.399 0.515 0.500 0.739 \n",
375
+ ".. ... ... ... ... \n",
376
+ "115 0.396 0.525 0.517 0.767 \n",
377
+ "116 0.399 0.529 0.520 0.777 \n",
378
+ "117 0.398 0.521 0.510 0.770 \n",
379
+ "118 0.411 0.523 0.524 0.774 \n",
380
+ "119 0.407 0.515 0.519 0.779 \n",
381
+ "\n",
382
+ " sciq/acc_norm arc/acc arc/acc_norm mmlu/acc mmlu/acc_norm \n",
383
+ "0 0.202 0.2190 0.2515 0.230285 0.250127 \n",
384
+ "1 0.502 0.2665 0.2855 0.242526 0.253291 \n",
385
+ "2 0.590 0.3030 0.3215 0.245745 0.260988 \n",
386
+ "3 0.622 0.3210 0.3385 0.250427 0.264451 \n",
387
+ "4 0.620 0.3320 0.3445 0.256134 0.270382 \n",
388
+ ".. ... ... ... ... ... \n",
389
+ "115 0.654 0.3480 0.3560 0.262357 0.276813 \n",
390
+ "116 0.664 0.3465 0.3555 0.261599 0.276664 \n",
391
+ "117 0.656 0.3555 0.3595 0.260928 0.278411 \n",
392
+ "118 0.662 0.3570 0.3600 0.263067 0.281104 \n",
393
+ "119 0.668 0.3590 0.3565 0.261681 0.279534 \n",
394
+ "\n",
395
+ "[120 rows x 22 columns]"
396
+ ]
397
+ },
398
+ "execution_count": 6,
399
+ "metadata": {},
400
+ "output_type": "execute_result"
401
+ }
402
+ ],
403
+ "source": [
404
+ "import pandas as pd\n",
405
+ "from matplotlib.figure import Figure\n",
406
+ "\n",
407
+ "df = pd.read_csv(\"../src_data/wet_comparison.csv\")\n",
408
+ "df"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 7,
414
+ "id": "b610f43caefdf01",
415
+ "metadata": {
416
+ "ExecuteTime": {
417
+ "end_time": "2024-05-13T15:30:52.866635Z",
418
+ "start_time": "2024-05-13T15:30:52.865068Z"
419
+ },
420
+ "collapsed": false
421
+ },
422
+ "outputs": [],
423
+ "source": [
424
+ "runs_mapping = {\n",
425
+ " \"wet-extraction-2019-18\": \"WET data\",\n",
426
+ " \"ind_minhash-CC-MAIN-2019-18\": \"Extracted from WARC\",\n",
427
+ "}"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 9,
433
+ "id": "initial_id",
434
+ "metadata": {
435
+ "ExecuteTime": {
436
+ "end_time": "2024-05-13T15:30:53.034617Z",
437
+ "start_time": "2024-05-13T15:30:52.867342Z"
438
+ },
439
+ "collapsed": true
440
+ },
441
+ "outputs": [],
442
+ "source": [
443
+ "import json\n",
444
+ "import os\n",
445
+ "from matplotlib import pyplot as plt\n",
446
+ "metrics = ['agg_score', 'commonsense_qa/acc_norm', 'hellaswag/acc_norm', 'openbookqa/acc_norm', 'piqa/acc_norm',\n",
447
+ " 'siqa/acc_norm', 'winogrande/acc_norm', 'arc/acc_norm', 'mmlu/acc_norm']\n",
448
+ "\n",
449
+ "def normalize_runname(runname):\n",
450
+ " return runname.replace(\"/\", \"_\")\n",
451
+ "\n",
452
+ "grouped = (\n",
453
+ " df.groupby([\"runname\", \"steps\"])\n",
454
+ " .agg(\n",
455
+ " {\n",
456
+ " key: \"mean\" for key in metrics\n",
457
+ " }\n",
458
+ " )\n",
459
+ " .reset_index()\n",
460
+ ")\n",
461
+ "\n",
462
+ "file_id=\"../assets/data/plots/wet_comparison\"\n",
463
+ "files = {}\n",
464
+ "for metric in metrics:\n",
465
+ " datas = {}\n",
466
+ " for name, group in grouped.groupby(\"runname\"):\n",
467
+ " if name not in runs_mapping:\n",
468
+ " continue\n",
469
+ " group = group[[\"steps\", metric]].sort_values(by=\"steps\")\n",
470
+ " group = group.set_index(\"steps\")\n",
471
+ " rolling_avg = group\n",
472
+ " # rolling_avg = group.rolling(window=5).mean()\n",
473
+ " datas[name] = {\n",
474
+ " \"x\": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),\n",
475
+ " \"y\": rolling_avg[metric].tolist(),\n",
476
+ " \"label\": runs_mapping[name],\n",
477
+ " }\n",
478
+ " # Order the runs by their final metric value, highest first\n",
479
+ " datas = {k: v for k, v in sorted(datas.items(), key=lambda x: -x[1][\"y\"][-1])}\n",
480
+ " # Create a folder\n",
481
+ " os.makedirs(f\"{file_id}\", exist_ok=True)\n",
482
+ " with open(f\"{file_id}/{normalize_runname(metric)}.json\", \"w\") as f:\n",
483
+ " json.dump({\n",
484
+ " \"data\": datas,\n",
485
+ " \"layout\": {\n",
486
+ " \"title\": {\n",
487
+ " \"text\": \"WET data is worse than data extracted from WARC\"\n",
488
+ " },\n",
489
+ " }\n",
490
+ " }, f)\n",
491
+ " files[metric] = {\"file\": f\"{normalize_runname(metric)}.json\"}\n",
492
+ "# Create index\n",
493
+ "with open(f\"{file_id}/index.json\", \"w\") as f:\n",
494
+ " json.dump({\n",
495
+ " \"files\": files,\n",
496
+ " \"settings\": {\n",
497
+ " \"defaultMetric\": \"agg_score\",\n",
498
+ " \"slider\":{\"min\":0,\"max\":10,\"default\":0}\n",
499
+ " }\n",
500
+ " }, f)\n",
501
+ " "
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": 3,
507
+ "id": "af28ebbd054cdc33",
508
+ "metadata": {
509
+ "ExecuteTime": {
510
+ "end_time": "2024-05-13T15:30:53.036912Z",
511
+ "start_time": "2024-05-13T15:30:53.035519Z"
512
+ },
513
+ "collapsed": false
514
+ },
515
+ "outputs": [],
516
+ "source": []
517
+ }
518
+ ],
519
+ "metadata": {
520
+ "kernelspec": {
521
+ "display_name": "Python 3",
522
+ "language": "python",
523
+ "name": "python3"
524
+ },
525
+ "language_info": {
526
+ "codemirror_mode": {
527
+ "name": "ipython",
528
+ "version": 3
529
+ },
530
+ "file_extension": ".py",
531
+ "mimetype": "text/x-python",
532
+ "name": "python",
533
+ "nbconvert_exporter": "python",
534
+ "pygments_lexer": "ipython3",
535
+ "version": "3.12.2"
536
+ }
537
+ },
538
+ "nbformat": 4,
539
+ "nbformat_minor": 5
540
+ }