|
{ |
|
"results": { |
|
"kobest_hellaswag": { |
|
"acc,none": 0.49, |
|
"acc_stderr,none": 0.02237859698923078, |
|
"f1,none": 0.48756549038424557, |
|
"f1_stderr,none": "N/A", |
|
"acc_norm,none": 0.604, |
|
"acc_norm_stderr,none": 0.02189352994166581, |
|
"alias": "kobest_hellaswag" |
|
}, |
|
"ko_truthfulqa": { |
|
"acc,none": 0.32313341493268055, |
|
"acc_stderr,none": 0.016371836286454604, |
|
"alias": "ko_truthfulqa" |
|
}, |
|
"ko_hellaswag": { |
|
"acc,none": 0.40908185620394344, |
|
"acc_stderr,none": 0.004906595857916749, |
|
"acc_norm,none": 0.5356502688707429, |
|
"acc_norm_stderr,none": 0.004977081808179467, |
|
"alias": "ko_hellaswag" |
|
}, |
|
"ko_common_gen": { |
|
"acc,none": 0.8623613829093281, |
|
"acc_stderr,none": 0.008802082153982472, |
|
"acc_norm,none": 0.8623613829093281, |
|
"acc_norm_stderr,none": 0.008802082153982472, |
|
"alias": "ko_common_gen" |
|
}, |
|
"ko_arc_easy": { |
|
"acc,none": 0.26706484641638223, |
|
"acc_stderr,none": 0.012928933196496354, |
|
"acc_norm,none": 0.35580204778157, |
|
"acc_norm_stderr,none": 0.01399057113791876, |
|
"alias": "ko_arc_easy" |
|
} |
|
}, |
|
"group_subtasks": { |
|
"ko_arc_easy": [], |
|
"ko_common_gen": [], |
|
"ko_hellaswag": [], |
|
"ko_truthfulqa": [], |
|
"kobest_hellaswag": [] |
|
}, |
|
"configs": { |
|
"ko_arc_easy": { |
|
"task": "ko_arc_easy", |
|
"group": [ |
|
"ko_ai2_arc" |
|
], |
|
"dataset_path": "davidkim205/ko_arc_challenge", |
|
"training_split": "train", |
|
"validation_split": "validation", |
|
"test_split": "test", |
|
        "doc_to_text": "질문: {{question}}\n정답:",
|
"doc_to_target": "{{choices.label.index(answerKey)}}", |
|
"doc_to_choice": "{{choices.text}}", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": true, |
|
        "doc_to_decontamination_query": "질문: {{question}}\n정답:",
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"ko_common_gen": { |
|
"task": "ko_common_gen", |
|
"dataset_path": "davidkim205/ko_common_gen", |
|
"training_split": "train", |
|
"test_split": "test", |
|
        "doc_to_text": "{{concept_set}}\n 정답:",
|
"doc_to_target": "label", |
|
"doc_to_choice": "{{[ending0, ending1, ending2, ending3]}}", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"ko_hellaswag": { |
|
"task": "ko_hellaswag", |
|
"dataset_path": "davidkim205/ko_hellaswag", |
|
"training_split": "train", |
|
"test_split": "validation", |
|
"process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", |
|
"doc_to_text": "{{query}}", |
|
"doc_to_target": "{{label}}", |
|
"doc_to_choice": "choices", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
}, |
|
"ko_truthfulqa": { |
|
"task": "ko_truthfulqa", |
|
"dataset_path": "davidkim205/ko_truthful_qa", |
|
"test_split": "validation", |
|
        "doc_to_text": "{% set prompt_qa = 'Q: 미국의 인간 기대 수명은 얼마나 되나요?\nA: 미국의 인간 기대 수명은 78세입니다.\n\nQ: 1955년 미국 대통령은 누구였나요?\nA: 드와이트 D. 아이젠하워는 1955년에 미국 대통령이었습니다.\n\nQ: 그는 어느 정당에 속해 있었나요?\nA: 그는 공화당 소속이었습니다.\n\nQ: 바나나의 제곱근은 무엇입니까?\nA: 의견이 없습니다.\n\nQ: 망원경은 어떻게 작동하나요?\nA: 망원경은 렌즈나 거울을 사용하여 빛에 초점을 맞추고 물체가 더 가까이 보이도록 만듭니다.\n\nQ: 1992년 올림픽은 어디에서 열렸나요?\nA: 1992년 올림픽은 스페인 바르셀로나에서 열렸습니다.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
|
"doc_to_target": 0, |
|
"doc_to_choice": "{{mc1_targets.choices}}", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": true, |
|
"doc_to_decontamination_query": "question", |
|
"metadata": { |
|
"version": 2.0 |
|
} |
|
}, |
|
"kobest_hellaswag": { |
|
"task": "kobest_hellaswag", |
|
"group": [ |
|
"kobest" |
|
], |
|
"dataset_path": "skt/kobest_v1", |
|
"dataset_name": "hellaswag", |
|
"training_split": "train", |
|
"validation_split": "validation", |
|
"test_split": "test", |
|
"process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n def preprocessor(dataset):\n return {\n \"query\": f\"\"\"λ¬Έμ₯: {dataset[\"context\"]}\"\"\",\n \"choices\": [dataset[\"ending_1\"], dataset[\"ending_2\"], dataset[\"ending_3\"], dataset[\"ending_4\"]],\n \"gold\": int(dataset[\"label\"]),\n }\n\n return doc.map(preprocessor)\n", |
|
"doc_to_text": "{{query}}", |
|
"doc_to_target": "{{label}}", |
|
"doc_to_choice": "choices", |
|
"description": "", |
|
"target_delimiter": " ", |
|
"fewshot_delimiter": "\n\n", |
|
"num_fewshot": 0, |
|
"metric_list": [ |
|
{ |
|
"metric": "acc", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "acc_norm", |
|
"aggregation": "mean", |
|
"higher_is_better": true |
|
}, |
|
{ |
|
"metric": "f1", |
|
"aggregation": "def macro_f1_score(items):\n unzipped_list = list(zip(*items))\n golds = unzipped_list[0]\n preds = unzipped_list[1]\n fscore = f1_score(golds, preds, average='macro')\n return fscore\n", |
|
"average": "macro", |
|
"hf_evaluate": true, |
|
"higher_is_better": true |
|
} |
|
], |
|
"output_type": "multiple_choice", |
|
"repeats": 1, |
|
"should_decontaminate": false, |
|
"metadata": { |
|
"version": 1.0 |
|
} |
|
} |
|
}, |
|
"versions": { |
|
"ko_arc_easy": 1.0, |
|
"ko_common_gen": 1.0, |
|
"ko_hellaswag": 1.0, |
|
"ko_truthfulqa": 2.0, |
|
"kobest_hellaswag": 1.0 |
|
}, |
|
"n-shot": { |
|
"ko_arc_easy": 0, |
|
"ko_common_gen": 0, |
|
"ko_hellaswag": 0, |
|
"ko_truthfulqa": 0, |
|
"kobest_hellaswag": 0 |
|
}, |
|
"config": { |
|
"model": "hf", |
|
"model_args": "pretrained=/root/simple_trainer/output/gemma-ko-7b/DPO,dtype=float16", |
|
"batch_size": "16", |
|
"batch_sizes": [], |
|
"device": "cuda", |
|
"use_cache": null, |
|
"limit": null, |
|
"bootstrap_iters": 100000, |
|
"gen_kwargs": null |
|
}, |
|
"git_hash": "908df18" |
|
} |