|
{ |
|
"results": { |
|
"kobest_hellaswag": { |
|
"acc,none": 0.49, |
|
"acc_stderr,none": 0.02237859698923078, |
|
"f1,none": 0.48756549038424557, |
|
"f1_stderr,none": "N/A", |
|
"acc_norm,none": 0.604, |
|
"acc_norm_stderr,none": 0.02189352994166581, |
|
"alias": "kobest_hellaswag" |
|
}, |
|
"ko_truthfulqa": { |
|
"acc,none": 0.32313341493268055, |
|
"acc_stderr,none": 0.016371836286454604, |
|
"alias": "ko_truthfulqa" |
|
}, |
|
"ko_hellaswag": { |
|
"acc,none": 0.40908185620394344, |
|
"acc_stderr,none": 0.004906595857916749, |
|
"acc_norm,none": 0.5356502688707429, |
|
"acc_norm_stderr,none": 0.004977081808179467, |
|
"alias": "ko_hellaswag" |
|
}, |
|
"ko_common_gen": { |
|
"acc,none": 0.8623613829093281, |
|
"acc_stderr,none": 0.008802082153982472, |
|
"acc_norm,none": 0.8623613829093281, |
|
"acc_norm_stderr,none": 0.008802082153982472, |
|
"alias": "ko_common_gen" |
|
}, |
|
"ko_arc_easy": { |
|
"acc,none": 0.26706484641638223, |
|
"acc_stderr,none": 0.012928933196496354, |
|
"acc_norm,none": 0.35580204778157, |
|
"acc_norm_stderr,none": 0.01399057113791876, |
|
"alias": "ko_arc_easy" |
|
} |
|
}, |
|
"group_subtasks": { |
|
"ko_arc_easy": [], |
|
"ko_common_gen": [], |
|
"ko_hellaswag": [], |
|
"ko_truthfulqa": [], |
|
"kobest_hellaswag": [] |
|
}, |
|
"configs": { |
|
"ko_arc_easy": { |
|
"task": "ko_arc_easy", |
|
"group": [ |
|
"ko_ai2_arc" |
|
], |
|
"dataset_path": "davidkim205/ko_arc_challenge", |
|
"training_split": "train", |
|
"validation_split": "validation", |
|
"test_split": "test", |
|
        "doc_to_text": "질문: {{question}}\n정답:",
|
"doc_to_target": "{{choices.label.index(answerKey)}}", |
|
"doc_to_choice": "{{choices.text}}", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": true, |
|
        "doc_to_decontamination_query": "질문: {{question}}\n정답:",
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"ko_common_gen": { |
|
"task": "ko_common_gen", |
|
"dataset_path": "davidkim205/ko_common_gen", |
|
"training_split": "train", |
|
"test_split": "test", |
|
        "doc_to_text": "{{concept_set}}\n 정답:",
|
"doc_to_target": "label", |
|
"doc_to_choice": "{{[ending0, ending1, ending2, ending3]}}", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"ko_hellaswag": { |
|
"task": "ko_hellaswag", |
|
"dataset_path": "davidkim205/ko_hellaswag", |
|
"training_split": "train", |
|
"test_split": "validation", |
|
"process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", |
|
"doc_to_text": "{{query}}", |
|
"doc_to_target": "{{label}}", |
|
"doc_to_choice": "choices", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"ko_truthfulqa": { |
|
"task": "ko_truthfulqa", |
|
"dataset_path": "davidkim205/ko_truthful_qa", |
|
"test_split": "validation", |
|
        "doc_to_text": "{% set prompt_qa = 'Q: 미국의 인간 기대 수명은 얼마나 되나요?\nA: 미국의 인간 기대 수명은 78세입니다.\n\nQ: 1955년 미국 대통령은 누구였나요?\nA: 드와이트 D. 아이젠하워는 1955년에 미국 대통령이었습니다.\n\nQ: 그는 어느 정당에 속해 있었나요?\nA: 그는 공화당 소속이었습니다.\n\nQ: 바나나의 제곱근은 무엇입니까?\nA: 의견이 없습니다.\n\nQ: 망원경은 어떻게 작동하나요?\nA: 망원경은 렌즈나 거울을 사용하여 빛에 초점을 맞추고 물체가 더 가까이 보이도록 만듭니다.\n\nQ: 1992년 올림픽은 어디에서 열렸나요?\nA: 1992년 올림픽은 스페인 바르셀로나에서 열렸습니다.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
|
"doc_to_target": 0, |
|
"doc_to_choice": "{{mc1_targets.choices}}", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": true, |
|
"doc_to_decontamination_query": "question", |
|
"metadata": { |
|
"version": 2.0 |
|
} |
|
}, |
|
"kobest_hellaswag": { |
|
"task": "kobest_hellaswag", |
|
"group": [ |
|
"kobest" |
|
], |
|
"dataset_path": "skt/kobest_v1", |
|
"dataset_name": "hellaswag", |
|
"training_split": "train", |
|
"validation_split": "validation", |
|
"test_split": "test", |
|
"process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"λ¬Έμ₯: {dataset[\"context\"]}\"\"\",\n \"choices\": [dataset[\"ending_1\"], dataset[\"ending_2\"], dataset[\"ending_3\"], dataset[\"ending_4\"]],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", |
|
"doc_to_text": "{{query}}", |
|
"doc_to_target": "{{label}}", |
|
"doc_to_choice": "choices", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "f1", |
|
"aggregation": "def macro_f1_score(items):\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average='macro')\n return fscore\n", |
|
"average": "macro", |
|
"hf_evaluate": true, |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
} |
|
}, |
|
"versions": { |
|
"ko_arc_easy": 1.0, |
|
"ko_common_gen": 1.0, |
|
"ko_hellaswag": 1.0, |
|
"ko_truthfulqa": 2.0, |
|
"kobest_hellaswag": 1.0 |
|
}, |
|
"n-shot": { |
|
"ko_arc_easy": 0, |
|
"ko_common_gen": 0, |
|
"ko_hellaswag": 0, |
|
"ko_truthfulqa": 0, |
|
"kobest_hellaswag": 0 |
|
}, |
|
"config": { |
|
"model": "hf", |
|
"model_args": "pretrained=/root/simple_trainer/output/gemma-ko-7b/DPO,dtype=float16", |
|
"batch_size": "16", |
|
"batch_sizes": [], |
|
"device": "cuda", |
|
"use_cache": null, |
|
"limit": null, |
|
"bootstrap_iters": 100000, |
|
"gen_kwargs": null |
|
}, |
|
"git_hash": "908df18" |
|
} |