gemma-ko-7b-v1.0 / results.json
{
"results": {
"kobest_hellaswag": {
"acc,none": 0.49,
"acc_stderr,none": 0.02237859698923078,
"f1,none": 0.48756549038424557,
"f1_stderr,none": "N/A",
"acc_norm,none": 0.604,
"acc_norm_stderr,none": 0.02189352994166581,
"alias": "kobest_hellaswag"
},
"ko_truthfulqa": {
"acc,none": 0.32313341493268055,
"acc_stderr,none": 0.016371836286454604,
"alias": "ko_truthfulqa"
},
"ko_hellaswag": {
"acc,none": 0.40908185620394344,
"acc_stderr,none": 0.004906595857916749,
"acc_norm,none": 0.5356502688707429,
"acc_norm_stderr,none": 0.004977081808179467,
"alias": "ko_hellaswag"
},
"ko_common_gen": {
"acc,none": 0.8623613829093281,
"acc_stderr,none": 0.008802082153982472,
"acc_norm,none": 0.8623613829093281,
"acc_norm_stderr,none": 0.008802082153982472,
"alias": "ko_common_gen"
},
"ko_arc_easy": {
"acc,none": 0.26706484641638223,
"acc_stderr,none": 0.012928933196496354,
"acc_norm,none": 0.35580204778157,
"acc_norm_stderr,none": 0.01399057113791876,
"alias": "ko_arc_easy"
}
},
"group_subtasks": {
"ko_arc_easy": [],
"ko_common_gen": [],
"ko_hellaswag": [],
"ko_truthfulqa": [],
"kobest_hellaswag": []
},
"configs": {
"ko_arc_easy": {
"task": "ko_arc_easy",
"group": [
"ko_ai2_arc"
],
"dataset_path": "davidkim205/ko_arc_challenge",
"training_split": "train",
"validation_split": "validation",
"test_split": "test",
"doc_to_text": "질문: {{question}}\nμ •λ‹΅:",
"doc_to_target": "{{choices.label.index(answerKey)}}",
"doc_to_choice": "{{choices.text}}",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "acc_norm",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": true,
"doc_to_decontamination_query": "질문: {{question}}\nμ •λ‹΅:",
"metadata": {
"version": 1.0
}
},
"ko_common_gen": {
"task": "ko_common_gen",
"dataset_path": "davidkim205/ko_common_gen",
"training_split": "train",
"test_split": "test",
"doc_to_text": "{{concept_set}}\n μ •λ‹΅:",
"doc_to_target": "label",
"doc_to_choice": "{{[ending0, ending1, ending2, ending3]}}",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "acc_norm",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 1.0
}
},
"ko_hellaswag": {
"task": "ko_hellaswag",
"dataset_path": "davidkim205/ko_hellaswag",
"training_split": "train",
"test_split": "validation",
"process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
"doc_to_text": "{{query}}",
"doc_to_target": "{{label}}",
"doc_to_choice": "choices",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "acc_norm",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 1.0
}
},
"ko_truthfulqa": {
"task": "ko_truthfulqa",
"dataset_path": "davidkim205/ko_truthful_qa",
"test_split": "validation",
"doc_to_text": "{% set prompt_qa = 'Q: 미ꡭ의 인간 κΈ°λŒ€ 수λͺ…은 μ–Όλ§ˆλ‚˜ λ˜λ‚˜μš”?\nA: 미ꡭ의 인간 κΈ°λŒ€ 수λͺ…은 78μ„Έμž…λ‹ˆλ‹€.\n\nQ: 1955λ…„ λ―Έκ΅­ λŒ€ν†΅λ Ήμ€ λˆ„κ΅¬μ˜€λ‚˜μš”?\nA: λ“œμ™€μ΄νŠΈ D. μ•„μ΄μ  ν•˜μ›ŒλŠ” 1955년에 λ―Έκ΅­ λŒ€ν†΅λ Ήμ΄μ—ˆμŠ΅λ‹ˆλ‹€.\n\nQ: κ·ΈλŠ” μ–΄λŠ 정당에 속해 μžˆμ—ˆλ‚˜μš”?\nA: κ·ΈλŠ” 곡화당 μ†Œμ†μ΄μ—ˆμŠ΅λ‹ˆλ‹€.\n\nQ: λ°”λ‚˜λ‚˜μ˜ μ œκ³±κ·Όμ€ λ¬΄μ—‡μž…λ‹ˆκΉŒ?\nA: 의견이 μ—†μŠ΅λ‹ˆλ‹€.\n\nQ: 망원경은 μ–΄λ–»κ²Œ μž‘λ™ν•˜λ‚˜μš”?\nA: 망원경은 λ Œμ¦ˆλ‚˜ κ±°μšΈμ„ μ‚¬μš©ν•˜μ—¬ λΉ›μ˜ μ΄ˆμ μ„ λ§žμΆ”κ³  물체가 더 κ°€κΉŒμ΄ 보이도둝 λ§Œλ“­λ‹ˆλ‹€.\n\nQ: 1992λ…„ μ˜¬λ¦Όν”½μ€ μ–΄λ””μ—μ„œ μ—΄λ Έλ‚˜μš”?\nA: 1992λ…„ μ˜¬λ¦Όν”½μ€ 슀페인 λ°”λ₯΄μ…€λ‘œλ‚˜μ—μ„œ μ—΄λ ΈμŠ΅λ‹ˆλ‹€.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
"doc_to_target": 0,
"doc_to_choice": "{{mc1_targets.choices}}",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": true,
"doc_to_decontamination_query": "question",
"metadata": {
"version": 2.0
}
},
"kobest_hellaswag": {
"task": "kobest_hellaswag",
"group": [
"kobest"
],
"dataset_path": "skt/kobest_v1",
"dataset_name": "hellaswag",
"training_split": "train",
"validation_split": "validation",
"test_split": "test",
"process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"λ¬Έμž₯: {dataset[\"context\"]}\"\"\",\n \"choices\": [dataset[\"ending_1\"], dataset[\"ending_2\"], dataset[\"ending_3\"], dataset[\"ending_4\"]],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n",
"doc_to_text": "{{query}}",
"doc_to_target": "{{label}}",
"doc_to_choice": "choices",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "acc_norm",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "f1",
"aggregation": "def macro_f1_score(items):\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average='macro')\n return fscore\n",
"average": "macro",
"hf_evaluate": true,
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 1.0
}
}
},
"versions": {
"ko_arc_easy": 1.0,
"ko_common_gen": 1.0,
"ko_hellaswag": 1.0,
"ko_truthfulqa": 2.0,
"kobest_hellaswag": 1.0
},
"n-shot": {
"ko_arc_easy": 0,
"ko_common_gen": 0,
"ko_hellaswag": 0,
"ko_truthfulqa": 0,
"kobest_hellaswag": 0
},
"config": {
"model": "hf",
"model_args": "pretrained=/root/simple_trainer/output/gemma-ko-7b/DPO,dtype=float16",
"batch_size": "16",
"batch_sizes": [],
"device": "cuda",
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null
},
"git_hash": "908df18"
}
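For reference, a minimal Python sketch for reading the per-task metrics in the `results` block above back into a short summary. The local filename `results.json` is an assumption (save the JSON above under that name); the rest simply mirrors the structure of the harness output, where each metric key carries a `,none` suffix naming the filter that produced it.

```python
import json

# Load the lm-evaluation-harness output shown above
# (the local path "results.json" is an assumption).
with open("results.json", encoding="utf-8") as f:
    report = json.load(f)

# Each task entry stores metrics under keys such as "acc,none" and
# "acc_norm,none"; the ",none" suffix is the filter name used by the harness.
for task, metrics in sorted(report["results"].items()):
    acc = metrics["acc,none"]
    stderr = metrics["acc_stderr,none"]
    line = f"{task:20s} acc={acc:.4f} (±{stderr:.4f})"
    if "acc_norm,none" in metrics:
        line += f" acc_norm={metrics['acc_norm,none']:.4f}"
    print(line)
```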