File size: 8,311 Bytes
1f97149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
{
  "results": {
    "kobest_hellaswag": {
      "acc,none": 0.49,
      "acc_stderr,none": 0.02237859698923078,
      "f1,none": 0.48756549038424557,
      "f1_stderr,none": "N/A",
      "acc_norm,none": 0.604,
      "acc_norm_stderr,none": 0.02189352994166581,
      "alias": "kobest_hellaswag"
    },
    "ko_truthfulqa": {
      "acc,none": 0.32313341493268055,
      "acc_stderr,none": 0.016371836286454604,
      "alias": "ko_truthfulqa"
    },
    "ko_hellaswag": {
      "acc,none": 0.40908185620394344,
      "acc_stderr,none": 0.004906595857916749,
      "acc_norm,none": 0.5356502688707429,
      "acc_norm_stderr,none": 0.004977081808179467,
      "alias": "ko_hellaswag"
    },
    "ko_common_gen": {
      "acc,none": 0.8623613829093281,
      "acc_stderr,none": 0.008802082153982472,
      "acc_norm,none": 0.8623613829093281,
      "acc_norm_stderr,none": 0.008802082153982472,
      "alias": "ko_common_gen"
    },
    "ko_arc_easy": {
      "acc,none": 0.26706484641638223,
      "acc_stderr,none": 0.012928933196496354,
      "acc_norm,none": 0.35580204778157,
      "acc_norm_stderr,none": 0.01399057113791876,
      "alias": "ko_arc_easy"
    }
  },
  "group_subtasks": {
    "ko_arc_easy": [],
    "ko_common_gen": [],
    "ko_hellaswag": [],
    "ko_truthfulqa": [],
    "kobest_hellaswag": []
  },
  "configs": {
    "ko_arc_easy": {
      "task": "ko_arc_easy",
      "group": [
        "ko_ai2_arc"
      ],
      "dataset_path": "davidkim205/ko_arc_challenge",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "doc_to_text": "질문: {{question}}\nμ •λ‹΅:",
      "doc_to_target": "{{choices.label.index(answerKey)}}",
      "doc_to_choice": "{{choices.text}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "질문: {{question}}\nμ •λ‹΅:",
      "metadata": {
        "version": 1.0
      }
    },
    "ko_common_gen": {
      "task": "ko_common_gen",
      "dataset_path": "davidkim205/ko_common_gen",
      "training_split": "train",
      "test_split": "test",
      "doc_to_text": "{{concept_set}}\n μ •λ‹΅:",
      "doc_to_target": "label",
      "doc_to_choice": "{{[ending0, ending1, ending2, ending3]}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    },
    "ko_hellaswag": {
      "task": "ko_hellaswag",
      "dataset_path": "davidkim205/ko_hellaswag",
      "training_split": "train",
      "test_split": "validation",
      "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
      "doc_to_text": "{{query}}",
      "doc_to_target": "{{label}}",
      "doc_to_choice": "choices",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    },
    "ko_truthfulqa": {
      "task": "ko_truthfulqa",
      "dataset_path": "davidkim205/ko_truthful_qa",
      "test_split": "validation",
      "doc_to_text": "{% set prompt_qa = 'Q: 미ꡭ의 인간 κΈ°λŒ€ 수λͺ…은 μ–Όλ§ˆλ‚˜ λ˜λ‚˜μš”?\nA: 미ꡭ의 인간 κΈ°λŒ€ 수λͺ…은 78μ„Έμž…λ‹ˆλ‹€.\n\nQ: 1955λ…„ λ―Έκ΅­ λŒ€ν†΅λ Ήμ€ λˆ„κ΅¬μ˜€λ‚˜μš”?\nA: λ“œμ™€μ΄νŠΈ D. μ•„μ΄μ  ν•˜μ›ŒλŠ” 1955년에 λ―Έκ΅­ λŒ€ν†΅λ Ήμ΄μ—ˆμŠ΅λ‹ˆλ‹€.\n\nQ: κ·ΈλŠ” μ–΄λŠ 정당에 속해 μžˆμ—ˆλ‚˜μš”?\nA: κ·ΈλŠ” 곡화당 μ†Œμ†μ΄μ—ˆμŠ΅λ‹ˆλ‹€.\n\nQ: λ°”λ‚˜λ‚˜μ˜ μ œκ³±κ·Όμ€ λ¬΄μ—‡μž…λ‹ˆκΉŒ?\nA: 의견이 μ—†μŠ΅λ‹ˆλ‹€.\n\nQ: 망원경은 μ–΄λ–»κ²Œ μž‘λ™ν•˜λ‚˜μš”?\nA: 망원경은 λ Œμ¦ˆλ‚˜ κ±°μšΈμ„ μ‚¬μš©ν•˜μ—¬ λΉ›μ˜ μ΄ˆμ μ„ λ§žμΆ”κ³  물체가 더 κ°€κΉŒμ΄ 보이도둝 λ§Œλ“­λ‹ˆλ‹€.\n\nQ: 1992λ…„ μ˜¬λ¦Όν”½μ€ μ–΄λ””μ—μ„œ μ—΄λ Έλ‚˜μš”?\nA: 1992λ…„ μ˜¬λ¦Όν”½μ€ 슀페인 λ°”λ₯΄μ…€λ‘œλ‚˜μ—μ„œ μ—΄λ ΈμŠ΅λ‹ˆλ‹€.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
      "doc_to_target": 0,
      "doc_to_choice": "{{mc1_targets.choices}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "question",
      "metadata": {
        "version": 2.0
      }
    },
    "kobest_hellaswag": {
      "task": "kobest_hellaswag",
      "group": [
        "kobest"
      ],
      "dataset_path": "skt/kobest_v1",
      "dataset_name": "hellaswag",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n    def preprocessor(dataset):\n        return {\n            \"query\": f\"\"\"λ¬Έμž₯: {dataset[\"context\"]}\"\"\",\n            \"choices\": [dataset[\"ending_1\"], dataset[\"ending_2\"], dataset[\"ending_3\"], dataset[\"ending_4\"]],\n            \"gold\": int(dataset[\"label\"]),\n        }\n\n    return doc.map(preprocessor)\n",
      "doc_to_text": "{{query}}",
      "doc_to_target": "{{label}}",
      "doc_to_choice": "choices",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "f1",
          "aggregation": "def macro_f1_score(items):\n    unzipped_list = list(zip(*items))\n    golds = unzipped_list[0]\n    preds = unzipped_list[1]\n    fscore = f1_score(golds, preds, average='macro')\n    return fscore\n",
          "average": "macro",
          "hf_evaluate": true,
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    }
  },
  "versions": {
    "ko_arc_easy": 1.0,
    "ko_common_gen": 1.0,
    "ko_hellaswag": 1.0,
    "ko_truthfulqa": 2.0,
    "kobest_hellaswag": 1.0
  },
  "n-shot": {
    "ko_arc_easy": 0,
    "ko_common_gen": 0,
    "ko_hellaswag": 0,
    "ko_truthfulqa": 0,
    "kobest_hellaswag": 0
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=/root/simple_trainer/output/gemma-ko-7b/DPO,dtype=float16",
    "batch_size": "16",
    "batch_sizes": [],
    "device": "cuda",
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "908df18"
}