Audreygyj committed
Commit bfe7b12
Parent: 04b995d

Upload 4 files

add config and tokenizer

Files changed (4)
  1. config.json +28 -0
  2. config.py +735 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +35 -0
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "bos_token_id": 32013,
+   "eos_token_id": 32021,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 5504,
+   "max_position_embeddings": 16384,
+   "model_type": "llama",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 16,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "factor": 4.0,
+     "type": "linear"
+   },
+   "rope_theta": 100000,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.33.1",
+   "use_cache": true,
+   "vocab_size": 32256
+ }
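
This is a standard transformers model config for a Llama-architecture checkpoint; the sizes (24 layers, hidden size 2048, vocab 32256, 16K context via linear RoPE scaling) match the deepseek-ai/deepseek-coder-1.3b-instruct base model referenced in config.py below. As a minimal sketch (not part of the commit), the file can be loaded with transformers' AutoConfig, assuming the library is installed and config.json sits in the working directory:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(".")  # reads ./config.json
print(cfg.model_type)               # "llama"
print(cfg.max_position_embeddings)  # 16384
print(cfg.rope_scaling)             # {"factor": 4.0, "type": "linear"}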
config.py ADDED
@@ -0,0 +1,735 @@
+ DEVICE = "cuda:0"
+ DEBUG = False
+
+ config = {
+     "model": {
+         "codellama": {
+             "base_model_id": "codellama/CodeLlama-7b-hf",
+             "quantitize": "int8",
+             "dataset": "Arithmetic_Simple",
+             "data_collator": "DataCollatorForSeq2Seq",
+             "peft_config": {
+                 "lora_config": {
+                     "r": 16,
+                     "lora_alpha": 16,
+                     "target_modules": [
+                         "q_proj",
+                         "k_proj",
+                         "v_proj",
+                         "o_proj",
+                         "gate_proj",
+                         "up_proj",
+                         "down_proj",
+                     ],
+                     "lora_dropout": 0.05,
+                     "bias": "none",
+                     "task_type": "CAUSAL_LM",
+                 },
+             },
+             "training_args": {
+                 "output_dir": "codellama-output",
+                 "warmup_steps": 100,
+                 "per_device_train_batch_size": 1,
+                 "per_device_eval_batch_size": 1,
+                 "gradient_accumulation_steps": 4,
+                 "max_steps": 10000,
+                 "learning_rate": 3e-4,
+                 "optim": "adamw_torch",
+                 "logging_dir": "codellama-output-logs",
+                 "logging_steps": 10,
+                 "save_strategy": "steps",
+                 "save_steps": 500,
+                 "load_best_model_at_end": False,
+                 "group_by_length": True,
+                 "fp16": True,
+                 "evaluation_strategy": "steps",
+                 "eval_steps": 1000,
+                 # Uncomment to choose which integration results and logs are reported to.
+                 # With transformers v4, the default value is "all";
+                 # with transformers v5, the default value will be "none".
+                 # "report_to": "wandb",
+                 # Uncomment to set a custom run name (a generated one such as
+                 # "eternal-brook-20" is used if not set).
+                 # "run_name": "phi2-code-finetune",
+                 # Uncomment the following lines to trigger (Hugging Face built-in)
+                 # evaluation after every X steps of training.
+                 # "evaluation_strategy": "steps",
+                 # "eval_steps": 200,
+                 # "do_eval": True,
+             },
+             "tokenizer": {
+                 "tokenize_config": {
+                     "truncation": True,
+                     "max_length": 192,
+                     "padding": "max_length",
+                 },
+                 "prompt_template": "config/qa_template.txt",
+             },
+         },
+         "phi-2": {
+             "base_model_id": "microsoft/phi-2",
+             "quantitize": "fp16",
+             "dataset": "Arithmetic_Simple",
+             "data_collator": "DataCollatorForLanguageModeling",
+             "peft_config": {
+                 "lora_config": {
+                     "r": 32,
+                     "lora_alpha": 64,
+                     "target_modules": [
+                         "q_proj",
+                         "k_proj",
+                         "v_proj",
+                         "dense",
+                         "fc1",
+                         "fc2",
+                     ],
+                     "bias": "none",
+                     "lora_dropout": 0.05,
+                     "task_type": "CAUSAL_LM",
+                 },
+             },
+             "training_args": {
+                 "output_dir": "phi2-output",
+                 "warmup_steps": 500,
+                 # fp16: ~21.5 GiB VRAM; ~40 h to finish
+                 "per_device_train_batch_size": 1,
+                 "per_device_eval_batch_size": 1,
+                 "gradient_accumulation_steps": 4,
+                 "max_steps": 100000,
+                 "learning_rate": 5e-5,
+                 "optim": "adamw_torch",
+                 "logging_dir": "phi2-output-logs",
+                 "logging_steps": 100,
+                 "save_strategy": "steps",
+                 "save_steps": 500,
+                 "evaluation_strategy": "steps",
+                 "eval_steps": 500,
+                 "fp16": True,
+             },
+             "tokenizer": {
+                 "tokenize_config": {
+                     "truncation": True,
+                     "max_length": 512,
+                     "padding": "max_length",
+                 },
+                 "prompt_template": "config/qa_template.txt",
+             },
+         },
+         "deepseek": {
+             "base_model_id": "deepseek-ai/deepseek-coder-1.3b-instruct",
+             # Quantization guidelines for DeepSeek:
+             # - On T4, RTX 20xx, or anything older (e.g. V100): quantitize = "fp16"
+             # - On L4, A100, RTX 30xx/40xx, or anything Ampere or later: quantitize = "bf16"
+             "quantitize": "fp16",
+             "dataset": "Arithmetic_Hard_Third",
+             "data_collator": "DataCollatorForLanguageModeling",
+             "peft_config": {
+                 "lora": {  # trainable params = 30.0 M
+                     "r": 32,
+                     "lora_alpha": 64,
+                     "target_modules": [
+                         "q_proj",
+                         "k_proj",
+                         "v_proj",
+                         "o_proj",
+                         "gate_proj",
+                         "up_proj",
+                         "down_proj",
+                     ],
+                     "bias": "none",
+                     "lora_dropout": 0.05,
+                     "task_type": "CAUSAL_LM",
+                 },
+                 "lora_large": {  # trainable params = not checked yet
+                     "r": 128,
+                     "lora_alpha": 256,
+                     "target_modules": [
+                         "q_proj",
+                         "k_proj",
+                         "v_proj",
+                         "o_proj",
+                         "gate_proj",
+                         "up_proj",
+                         "down_proj",
+                     ],
+                     "bias": "none",
+                     "lora_dropout": 0.05,
+                     "task_type": "CAUSAL_LM",
+                 },
+             },
+             "p_tuning_config": {  # Doesn't work yet due to PEFT interface issues
+                 "num_virtual_tokens": 16,
+                 "num_transformer_submodules": 1,
+                 "token_dim": 2048,  # NOTE(Shih-Lun): should change w/ base LLM
+                 "encoder_hidden_size": 2048,
+                 "task_type": "CAUSAL_LM",
+             },
+             "training_args": {
+                 "warmup_steps": 500,
+                 # bf16: ~21.0 GiB VRAM; ~21 h to finish
+                 "per_device_train_batch_size": 4,
+                 "per_device_eval_batch_size": 4,
+                 "gradient_accumulation_steps": 1,
+                 "max_steps": 35000,
+                 "learning_rate": 2e-5,
+                 "optim": "adamw_torch",
+                 "logging_steps": 100 // 10,  # = 10
+                 "save_strategy": "steps",
+                 "save_total_limit": 10,
+                 "save_steps": 500,
+                 "evaluation_strategy": "steps",
+                 "eval_steps": 500,
+                 "weight_decay": 0.01,
+                 "report_to": "wandb",
+                 "dataloader_num_workers": 4,
+                 "load_best_model_at_end": True,
+                 # fp16/bf16 guidelines for DeepSeek:
+                 # - Full tuning: don't set fp16/bf16.
+                 # - LoRA: match the quantization setting (either fp16=True or bf16=True).
+                 "fp16": True,
+             },
+             "tokenizer": {
+                 "tokenize_config": {
+                     "truncation": True,
+                     "max_length": 512,
+                     "padding": "max_length",
+                 },
+                 "prompt_template": "config/qa_template.txt",
+             },
+         },
+     },
+     "dataset": {
+         "simple_dataset": {
+             "type": "huggingface",  # Public datasets on the Hugging Face Hub (only for testing)
+             "dataset_purpose": "downstream",
+             "name": "b-mc2/sql-create-context",
+             "train_split": 0.9,
+             "max_train_size": 100,
+             "filling_field": ["question", "context", "answer"],
+         },
+         "testdset": {
+             "type": "local",  # Local files
+             "dataset_purpose": "downstream",
+             "train_file": "data/Test/TestDataset.json",
+             "val_file": "data/Test/TestDataset.json",
+             "test_file": "data/Test/TestDataset.json",
+             "filling_field": ["prompted_question", "answer"],
+         },
+         "deepmind_mathematics": {
+             "type": "local",
+             "dataset_purpose": "downstream",
+             "filling_field": ["question", "answer"],
+             "epoch_length": {
+                 "train": 54 * 4 * 50,  # Module * BatchSize * lesson_num in each module
+                 "val": 54 * 4 * 1,
+             },
+         },
+         "mixture_codegen": {
+             "filling_field": ["Question", "Answer"],
+             "dataset_purpose": "downstream",
+         },
+         "Arithmetic_Hard_prompt_C12_with_gh": {
+             "filling_field": ["Question", "Answer"],
+             "dataset_purpose": "downstream",
+         },
+         "MathQA_Python_loader": {
+             "type": "list-like",  # List-like objects (we're going to use this for ablations)
+             "dataset_purpose": "downstream",
+             "train": "data/MathQA_Python_processed/mathqa_python_train_clean_final.json",
+             "val": "data/MathQA_Python_processed/mathqa_python_dev_clean_final.json",
+             "test": "data/MathQA_Python_processed/mathqa_python_test_clean_final.json",
+             "filling_field": ["Question", "Answer"],
+         },
+         "APPS_loader": {
+             "type": "list-like",  # List-like objects (we're going to use this for ablations)
+             "dataset_purpose": "downstream",
+             "train": "data/APPS/apps_train.json",
+             "val": "data/APPS/apps_dev.json",
+             "test": "data/APPS/test/apps_test_75.json",
+             "filling_field": ["Question", "Answer"],
+         },
+         "MBPP_loader": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "train": "data/MBPP/mbpp_train.json",
+             "val": "data/MBPP/mbpp_dev.json",
+             "test": "data/MBPP/mbpp_test.json",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Simple": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds0_100",
+                     # "Max_Ops3_Bounds0_1000",
+                     # "Max_Ops4_Bounds0_100",
+                     # "Max_Ops4_Bounds0_1000",
+                     # "Max_Ops5_Bounds0_100",
+                     # "Max_Ops5_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Simple",
+             "val": "data/Arithmetic/Curriculum_Simple",
+             "test": "data/Arithmetic/Curriculum_Simple",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Simple_First_Half": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds0_100",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Simple",
+             "val": "data/Arithmetic/Curriculum_Simple",
+             "test": "data/Arithmetic/Curriculum_Simple",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Simple_Second_Half": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops3_Bounds0_1000",
+                     "Max_Ops4_Bounds0_100",
+                     "Max_Ops4_Bounds0_1000",
+                     "Max_Ops5_Bounds0_100",
+                     "Max_Ops5_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Simple",
+             "val": "data/Arithmetic/Curriculum_Simple",
+             "test": "data/Arithmetic/Curriculum_Simple",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds-1000_1000",
+                     "Max_Ops1_Bounds-100_100",
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds-1000_1000",
+                     "Max_Ops2_Bounds-100_100",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds-1000_1000",
+                     "Max_Ops3_Bounds-100_100",
+                     "Max_Ops3_Bounds0_100",
+                     "Max_Ops3_Bounds0_1000",
+                     "Max_Ops4_Bounds-1000_1000",
+                     "Max_Ops4_Bounds-100_100",
+                     "Max_Ops4_Bounds0_100",
+                     "Max_Ops4_Bounds0_1000",
+                     "Max_Ops5_Bounds-1000_1000",
+                     "Max_Ops5_Bounds-100_100",
+                     "Max_Ops5_Bounds0_100",
+                     "Max_Ops5_Bounds0_1000",
+                     "Max_Ops6_Bounds-1000_1000",
+                     "Max_Ops6_Bounds-100_100",
+                     "Max_Ops6_Bounds0_100",
+                     "Max_Ops6_Bounds0_1000",
+                     "Max_Ops7_Bounds-1000_1000",
+                     "Max_Ops7_Bounds-100_100",
+                     "Max_Ops7_Bounds0_100",
+                     "Max_Ops7_Bounds0_1000",
+                     "Max_Ops8_Bounds-1000_1000",
+                     "Max_Ops8_Bounds-100_100",
+                     "Max_Ops8_Bounds0_100",
+                     "Max_Ops8_Bounds0_1000",
+                     "Max_Ops9_Bounds-1000_1000",
+                     "Max_Ops9_Bounds-100_100",
+                     "Max_Ops9_Bounds0_100",
+                     "Max_Ops9_Bounds0_1000",
+                     "Max_Ops10_Bounds-1000_1000",
+                     "Max_Ops10_Bounds-100_100",
+                     "Max_Ops10_Bounds0_100",
+                     "Max_Ops10_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard_First": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds-1000_1000",
+                     "Max_Ops1_Bounds-100_100",
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds-1000_1000",
+                     "Max_Ops2_Bounds-100_100",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds-1000_1000",
+                     "Max_Ops3_Bounds-100_100",
+                     "Max_Ops3_Bounds0_100",
+                     "Max_Ops3_Bounds0_1000",
+                     "Max_Ops4_Bounds-1000_1000",
+                     "Max_Ops4_Bounds-100_100",
+                     "Max_Ops4_Bounds0_100",
+                     "Max_Ops4_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard_Second": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops5_Bounds-1000_1000",
+                     "Max_Ops5_Bounds-100_100",
+                     "Max_Ops5_Bounds0_100",
+                     "Max_Ops5_Bounds0_1000",
+                     "Max_Ops6_Bounds-1000_1000",
+                     "Max_Ops6_Bounds-100_100",
+                     "Max_Ops6_Bounds0_100",
+                     "Max_Ops6_Bounds0_1000",
+                     "Max_Ops7_Bounds-1000_1000",
+                     "Max_Ops7_Bounds-100_100",
+                     "Max_Ops7_Bounds0_100",
+                     "Max_Ops7_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard_Third": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops8_Bounds-1000_1000",
+                     "Max_Ops8_Bounds-100_100",
+                     "Max_Ops8_Bounds0_100",
+                     "Max_Ops8_Bounds0_1000",
+                     "Max_Ops9_Bounds-1000_1000",
+                     "Max_Ops9_Bounds-100_100",
+                     "Max_Ops9_Bounds0_100",
+                     "Max_Ops9_Bounds0_1000",
+                     "Max_Ops10_Bounds-1000_1000",
+                     "Max_Ops10_Bounds-100_100",
+                     "Max_Ops10_Bounds0_100",
+                     "Max_Ops10_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard_prompt_C11": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds-1000_1000",
+                     "Max_Ops1_Bounds-100_100",
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds-1000_1000",
+                     "Max_Ops2_Bounds-100_100",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds-1000_1000",
+                     "Max_Ops3_Bounds-100_100",
+                     "Max_Ops3_Bounds0_100",
+                     "Max_Ops3_Bounds0_1000",
+                     "Max_Ops4_Bounds-1000_1000",
+                     "Max_Ops4_Bounds-100_100",
+                     "Max_Ops4_Bounds0_100",
+                     "Max_Ops4_Bounds0_1000",
+                     "Max_Ops5_Bounds-1000_1000",
+                     "Max_Ops5_Bounds-100_100",
+                     "Max_Ops5_Bounds0_100",
+                     "Max_Ops5_Bounds0_1000",
+                     "Max_Ops6_Bounds-1000_1000",
+                     "Max_Ops6_Bounds-100_100",
+                     "Max_Ops6_Bounds0_100",
+                     "Max_Ops6_Bounds0_1000",
+                     "Max_Ops7_Bounds-1000_1000",
+                     "Max_Ops7_Bounds-100_100",
+                     "Max_Ops7_Bounds0_100",
+                     "Max_Ops7_Bounds0_1000",
+                     "Max_Ops8_Bounds-1000_1000",
+                     "Max_Ops8_Bounds-100_100",
+                     "Max_Ops8_Bounds0_100",
+                     "Max_Ops8_Bounds0_1000",
+                     "Max_Ops9_Bounds-1000_1000",
+                     "Max_Ops9_Bounds-100_100",
+                     "Max_Ops9_Bounds0_100",
+                     "Max_Ops9_Bounds0_1000",
+                     "Max_Ops10_Bounds-1000_1000",
+                     "Max_Ops10_Bounds-100_100",
+                     "Max_Ops10_Bounds0_100",
+                     "Max_Ops10_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard_prompt_C12": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds-1000_1000",
+                     "Max_Ops1_Bounds-100_100",
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds-1000_1000",
+                     "Max_Ops2_Bounds-100_100",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds-1000_1000",
+                     "Max_Ops3_Bounds-100_100",
+                     "Max_Ops3_Bounds0_100",
+                     "Max_Ops3_Bounds0_1000",
+                     "Max_Ops4_Bounds-1000_1000",
+                     "Max_Ops4_Bounds-100_100",
+                     "Max_Ops4_Bounds0_100",
+                     "Max_Ops4_Bounds0_1000",
+                     "Max_Ops5_Bounds-1000_1000",
+                     "Max_Ops5_Bounds-100_100",
+                     "Max_Ops5_Bounds0_100",
+                     "Max_Ops5_Bounds0_1000",
+                     "Max_Ops6_Bounds-1000_1000",
+                     "Max_Ops6_Bounds-100_100",
+                     "Max_Ops6_Bounds0_100",
+                     "Max_Ops6_Bounds0_1000",
+                     "Max_Ops7_Bounds-1000_1000",
+                     "Max_Ops7_Bounds-100_100",
+                     "Max_Ops7_Bounds0_100",
+                     "Max_Ops7_Bounds0_1000",
+                     "Max_Ops8_Bounds-1000_1000",
+                     "Max_Ops8_Bounds-100_100",
+                     "Max_Ops8_Bounds0_100",
+                     "Max_Ops8_Bounds0_1000",
+                     "Max_Ops9_Bounds-1000_1000",
+                     "Max_Ops9_Bounds-100_100",
+                     "Max_Ops9_Bounds0_100",
+                     "Max_Ops9_Bounds0_1000",
+                     "Max_Ops10_Bounds-1000_1000",
+                     "Max_Ops10_Bounds-100_100",
+                     "Max_Ops10_Bounds0_100",
+                     "Max_Ops10_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_Hard_prompt_C12_intermediate": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops1_Bounds-1000_1000",
+                     "Max_Ops1_Bounds-100_100",
+                     "Max_Ops1_Bounds0_100",
+                     "Max_Ops1_Bounds0_1000",
+                     "Max_Ops2_Bounds-1000_1000",
+                     "Max_Ops2_Bounds-100_100",
+                     "Max_Ops2_Bounds0_100",
+                     "Max_Ops2_Bounds0_1000",
+                     "Max_Ops3_Bounds-1000_1000",
+                     "Max_Ops3_Bounds-100_100",
+                     "Max_Ops3_Bounds0_100",
+                     "Max_Ops3_Bounds0_1000",
+                     "Max_Ops4_Bounds-1000_1000",
+                     "Max_Ops4_Bounds-100_100",
+                     "Max_Ops4_Bounds0_100",
+                     "Max_Ops4_Bounds0_1000",
+                     "Max_Ops5_Bounds-1000_1000",
+                     "Max_Ops5_Bounds-100_100",
+                     "Max_Ops5_Bounds0_100",
+                     "Max_Ops5_Bounds0_1000",
+                     "Max_Ops6_Bounds-1000_1000",
+                     "Max_Ops6_Bounds-100_100",
+                     "Max_Ops6_Bounds0_100",
+                     "Max_Ops6_Bounds0_1000",
+                     "Max_Ops7_Bounds-1000_1000",
+                     "Max_Ops7_Bounds-100_100",
+                     "Max_Ops7_Bounds0_100",
+                     "Max_Ops7_Bounds0_1000",
+                     "Max_Ops8_Bounds-1000_1000",
+                     "Max_Ops8_Bounds-100_100",
+                     "Max_Ops8_Bounds0_100",
+                     "Max_Ops8_Bounds0_1000",
+                     "Max_Ops9_Bounds-1000_1000",
+                     "Max_Ops9_Bounds-100_100",
+                     "Max_Ops9_Bounds0_100",
+                     "Max_Ops9_Bounds0_1000",
+                     "Max_Ops10_Bounds-1000_1000",
+                     "Max_Ops10_Bounds-100_100",
+                     "Max_Ops10_Bounds0_100",
+                     "Max_Ops10_Bounds0_1000",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_Hard",
+             "val": "data/Arithmetic/Curriculum_Hard",
+             "test": "data/Arithmetic/Curriculum_Hard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "Arithmetic_XHard": {
+             "type": "list-like",
+             "dataset_purpose": "downstream",
+             "attributes": {
+                 "subjects": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 "lessons": [
+                     "Max_Ops10_Bounds0_10000.json",
+                     "Max_Ops10_Bounds0_1000.json",
+                     "Max_Ops10_Bounds-10000_10000.json",
+                     "Max_Ops10_Bounds-1000_1000.json",
+                     "Max_Ops11_Bounds0_10000.json",
+                     "Max_Ops11_Bounds0_1000.json",
+                     "Max_Ops11_Bounds-10000_10000.json",
+                     "Max_Ops11_Bounds-1000_1000.json",
+                     "Max_Ops12_Bounds0_10000.json",
+                     "Max_Ops12_Bounds0_1000.json",
+                     "Max_Ops12_Bounds-10000_10000.json",
+                     "Max_Ops12_Bounds-1000_1000.json",
+                     "Max_Ops13_Bounds0_10000.json",
+                     "Max_Ops13_Bounds0_1000.json",
+                     "Max_Ops13_Bounds-10000_10000.json",
+                     "Max_Ops13_Bounds-1000_1000.json",
+                     "Max_Ops14_Bounds0_10000.json",
+                     "Max_Ops14_Bounds0_1000.json",
+                     "Max_Ops14_Bounds-10000_10000.json",
+                     "Max_Ops14_Bounds-1000_1000.json",
+                     "Max_Ops15_Bounds0_10000.json",
+                     "Max_Ops15_Bounds0_1000.json",
+                     "Max_Ops15_Bounds-10000_10000.json",
+                     "Max_Ops15_Bounds-1000_1000.json",
+                     "Max_Ops16_Bounds0_10000.json",
+                     "Max_Ops16_Bounds0_1000.json",
+                     "Max_Ops16_Bounds-10000_10000.json",
+                     "Max_Ops16_Bounds-1000_1000.json",
+                     "Max_Ops17_Bounds0_10000.json",
+                     "Max_Ops17_Bounds0_1000.json",
+                     "Max_Ops17_Bounds-10000_10000.json",
+                     "Max_Ops17_Bounds-1000_1000.json",
+                     "Max_Ops18_Bounds0_10000.json",
+                     "Max_Ops18_Bounds0_1000.json",
+                     "Max_Ops18_Bounds-10000_10000.json",
+                     "Max_Ops18_Bounds-1000_1000.json",
+                     "Max_Ops19_Bounds0_10000.json",
+                     "Max_Ops19_Bounds0_1000.json",
+                     "Max_Ops19_Bounds-10000_10000.json",
+                     "Max_Ops19_Bounds-1000_1000.json",
+                     "Max_Ops1_Bounds0_10000.json",
+                     "Max_Ops1_Bounds0_1000.json",
+                     "Max_Ops1_Bounds-10000_10000.json",
+                     "Max_Ops1_Bounds-1000_1000.json",
+                     "Max_Ops20_Bounds0_10000.json",
+                     "Max_Ops20_Bounds0_1000.json",
+                     "Max_Ops20_Bounds-10000_10000.json",
+                     "Max_Ops20_Bounds-1000_1000.json",
+                     "Max_Ops2_Bounds0_10000.json",
+                     "Max_Ops2_Bounds0_1000.json",
+                     "Max_Ops2_Bounds-10000_10000.json",
+                     "Max_Ops2_Bounds-1000_1000.json",
+                     "Max_Ops3_Bounds0_10000.json",
+                     "Max_Ops3_Bounds0_1000.json",
+                     "Max_Ops3_Bounds-10000_10000.json",
+                     "Max_Ops3_Bounds-1000_1000.json",
+                     "Max_Ops4_Bounds0_10000.json",
+                     "Max_Ops4_Bounds0_1000.json",
+                     "Max_Ops4_Bounds-10000_10000.json",
+                     "Max_Ops4_Bounds-1000_1000.json",
+                     "Max_Ops5_Bounds0_10000.json",
+                     "Max_Ops5_Bounds0_1000.json",
+                     "Max_Ops5_Bounds-10000_10000.json",
+                     "Max_Ops5_Bounds-1000_1000.json",
+                     "Max_Ops6_Bounds0_10000.json",
+                     "Max_Ops6_Bounds0_1000.json",
+                     "Max_Ops6_Bounds-10000_10000.json",
+                     "Max_Ops6_Bounds-1000_1000.json",
+                     "Max_Ops7_Bounds0_10000.json",
+                     "Max_Ops7_Bounds0_1000.json",
+                     "Max_Ops7_Bounds-10000_10000.json",
+                     "Max_Ops7_Bounds-1000_1000.json",
+                     "Max_Ops8_Bounds0_10000.json",
+                     "Max_Ops8_Bounds0_1000.json",
+                     "Max_Ops8_Bounds-10000_10000.json",
+                     "Max_Ops8_Bounds-1000_1000.json",
+                     "Max_Ops9_Bounds0_10000.json",
+                     "Max_Ops9_Bounds0_1000.json",
+                     "Max_Ops9_Bounds-10000_10000.json",
+                     "Max_Ops9_Bounds-1000_1000.json",
+                 ]
+             },
+             "train": "data/Arithmetic/Curriculum_XHard",
+             "val": "data/Arithmetic/Curriculum_XHard",
+             "test": "data/Arithmetic/Curriculum_XHard",
+             "filling_field": ["Question", "Answer"],
+         },
+         "GSM8K": {
+             "type": "local",
+             "dataset_purpose": "downstream",
+             "train_file": "data/GSM8K/GSM8K_train.json",
+             "val_file": "data/GSM8K/GSM8K_test.json",
+             "test_file": "data/GSM8K/GSM8K_dev.json",
+             "filling_field": ["Body", "Question", "Answer"],
+         },
+         "APPS": {
+             "type": "local",
+             "dataset_purpose": "downstream",
+             "train_file": "data/APPS/apps_train.json",
+             "val_file": "data/APPS/apps_test.json",
+             "test_file": "data/APPS/apps_dev.json",
+             "filling_field": ["Body", "Question", "Answer"],
+         },
+         "ghcode_python": {
+             "type": "huggingface",
+             "dataset_purpose": "pretrain",
+             "name": "slseanwu/ghcode_python_split_700k",
+             "max_eval_size": 1000,
+             "max_train_size": 160000,
+             "filling_field": ["code"],
+         },
+     },
+ }
+
+
+ if DEBUG:
+     # config is a plain dict, so use item assignment here
+     # (attribute access such as config.epochs would raise AttributeError).
+     config["epochs"] = 100
+     config["save_steps"] = 10
+     config["train_dataset"] = "local-test-train"
+     config["val_dataset"] = "local-test-dev"
+     config["test_dataset"] = "test-clean"
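
The nested dict above is meant to be consumed by a training script elsewhere in the repo. As a minimal sketch (illustrative only, not part of the commit) of how the "deepseek" entry could be turned into peft/transformers objects, assuming both packages are installed:

from peft import LoraConfig
from transformers import TrainingArguments

model_cfg = config["model"]["deepseek"]

# The "lora" block maps one-to-one onto peft's LoraConfig keyword arguments.
lora_config = LoraConfig(**model_cfg["peft_config"]["lora"])

# The deepseek "training_args" block omits output_dir, so one must be
# supplied; "deepseek-output" here is a hypothetical path, not in the config.
train_args = TrainingArguments(
    output_dir="deepseek-output",
    **model_cfg["training_args"],
)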
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|begin▁of▁sentence|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|EOT|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "legacy": true,
+   "model_max_length": 16384,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<|end▁of▁sentence|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sp_model_kwargs": {},
+   "unk_token": null,
+   "tokenizer_class": "LlamaTokenizerFast",
+   "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+ }
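
The chat_template above wraps user turns in "### Instruction:" / "### Response:" blocks and injects DeepSeek Coder's default system prompt when the message list contains no system turn. A minimal sketch of rendering it (not part of the commit), assuming a transformers version with chat-template support (>= 4.34) and the tokenizer files in the working directory:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # reads tokenizer.json + tokenizer_config.json
messages = [
    {"role": "user", "content": "Write a Python function that reverses a string."},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # appends the trailing "### Response:" cue
)
print(prompt)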