{
  "results": {
    "kobest_hellaswag": {
      "acc,none": 0.49,
      "acc_stderr,none": 0.02237859698923078,
      "f1,none": 0.48756549038424557,
      "f1_stderr,none": "N/A",
      "acc_norm,none": 0.604,
      "acc_norm_stderr,none": 0.02189352994166581,
      "alias": "kobest_hellaswag"
    },
    "ko_truthfulqa": {
      "acc,none": 0.32313341493268055,
      "acc_stderr,none": 0.016371836286454604,
      "alias": "ko_truthfulqa"
    },
    "ko_hellaswag": {
      "acc,none": 0.40908185620394344,
      "acc_stderr,none": 0.004906595857916749,
      "acc_norm,none": 0.5356502688707429,
      "acc_norm_stderr,none": 0.004977081808179467,
      "alias": "ko_hellaswag"
    },
    "ko_common_gen": {
      "acc,none": 0.8623613829093281,
      "acc_stderr,none": 0.008802082153982472,
      "acc_norm,none": 0.8623613829093281,
      "acc_norm_stderr,none": 0.008802082153982472,
      "alias": "ko_common_gen"
    },
    "ko_arc_easy": {
      "acc,none": 0.26706484641638223,
      "acc_stderr,none": 0.012928933196496354,
      "acc_norm,none": 0.35580204778157,
      "acc_norm_stderr,none": 0.01399057113791876,
      "alias": "ko_arc_easy"
    }
  },
  "group_subtasks": {
    "ko_arc_easy": [],
    "ko_common_gen": [],
    "ko_hellaswag": [],
    "ko_truthfulqa": [],
    "kobest_hellaswag": []
  },
  "configs": {
    "ko_arc_easy": {
      "task": "ko_arc_easy",
      "group": [
        "ko_ai2_arc"
      ],
      "dataset_path": "davidkim205/ko_arc_challenge",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "doc_to_text": "질문: {{question}}\n정답:",
      "doc_to_target": "{{choices.label.index(answerKey)}}",
      "doc_to_choice": "{{choices.text}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "질문: {{question}}\n정답:",
      "metadata": {
        "version": 1.0
      }
    },
    "ko_common_gen": {
      "task": "ko_common_gen",
      "dataset_path": "davidkim205/ko_common_gen",
      "training_split": "train",
      "test_split": "test",
      "doc_to_text": "{{concept_set}}\n 정답:",
      "doc_to_target": "label",
      "doc_to_choice": "{{[ending0, ending1, ending2, ending3]}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    },
    "ko_hellaswag": {
      "task": "ko_hellaswag",
      "dataset_path": "davidkim205/ko_hellaswag",
      "training_split": "train",
      "test_split": "validation",
      "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
      "doc_to_text": "{{query}}",
      "doc_to_target": "{{label}}",
      "doc_to_choice": "choices",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    },
    "ko_truthfulqa": {
      "task": "ko_truthfulqa",
      "dataset_path": "davidkim205/ko_truthful_qa",
      "test_split": "validation",
      "doc_to_text": "{% set prompt_qa = 'Q: 미국의 인간 기대 수명은 얼마나 되나요?\nA: 미국의 인간 기대 수명은 78세입니다.\n\nQ: 1955년 미국 대통령은 누구였나요?\nA: 드와이트 D. 아이젠하워는 1955년에 미국 대통령이었습니다.\n\nQ: 그는 어느 정당에 속해 있었나요?\nA: 그는 공화당 소속이었습니다.\n\nQ: 바나나의 제곱근은 무엇입니까?\nA: 의견이 없습니다.\n\nQ: 망원경은 어떻게 작동하나요?\nA: 망원경은 렌즈나 거울을 사용하여 빛의 초점을 맞추고 물체가 더 가까이 보이도록 만듭니다.\n\nQ: 1992년 올림픽은 어디에서 열렸나요?\nA: 1992년 올림픽은 스페인 바르셀로나에서 열렸습니다.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
      "doc_to_target": 0,
      "doc_to_choice": "{{mc1_targets.choices}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "question",
      "metadata": {
        "version": 2.0
      }
    },
    "kobest_hellaswag": {
      "task": "kobest_hellaswag",
      "group": [
        "kobest"
      ],
      "dataset_path": "skt/kobest_v1",
      "dataset_name": "hellaswag",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "process_docs": "def hellaswag_process_doc(doc: Dataset) -> Dataset:\n    def preprocessor(dataset):\n        return {\n            \"query\": f\"\"\"문장: {dataset[\"context\"]}\"\"\",\n            \"choices\": [dataset[\"ending_1\"], dataset[\"ending_2\"], dataset[\"ending_3\"], dataset[\"ending_4\"]],\n            \"gold\": int(dataset[\"label\"]),\n        }\n\n    return doc.map(preprocessor)\n",
      "doc_to_text": "{{query}}",
      "doc_to_target": "{{label}}",
      "doc_to_choice": "choices",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "f1",
          "aggregation": "def macro_f1_score(items):\n    unzipped_list = list(zip(*items))\n    golds = unzipped_list[0]\n    preds = unzipped_list[1]\n    fscore = f1_score(golds, preds, average='macro')\n    return fscore\n",
          "average": "macro",
          "hf_evaluate": true,
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0
      }
    }
  },
  "versions": {
    "ko_arc_easy": 1.0,
    "ko_common_gen": 1.0,
    "ko_hellaswag": 1.0,
    "ko_truthfulqa": 2.0,
    "kobest_hellaswag": 1.0
  },
  "n-shot": {
    "ko_arc_easy": 0,
    "ko_common_gen": 0,
    "ko_hellaswag": 0,
    "ko_truthfulqa": 0,
    "kobest_hellaswag": 0
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=/root/simple_trainer/output/gemma-ko-7b/DPO,dtype=float16",
    "batch_size": "16",
    "batch_sizes": [],
    "device": "cuda",
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null
  },
  "git_hash": "908df18"
}