# coding=utf-8
# Automatically evaluates fine-tuned models through an OpenAI-compatible API.
# Usage: python evaluate_zh.py --evalset ceval/ceval-exam:law --split dev --output_file result.json
#                              --api_base http://localhost:8000/v1 --task_type choice --n_samples 100
# dataset format: id, question (string), A (string), B (string), C (string), D (string), answer (Literal["A", "B", "C", "D"])
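#
# For illustration, a single record in this format might look like (hypothetical values):
# {"id": 0, "question": "...", "A": "...", "B": "...", "C": "...", "D": "...", "answer": "A"}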


import os
import fire
import json
import openai
from tqdm import tqdm
from typing import Literal
from datasets import load_dataset


def format_example_choice(examples):
    # Builds multiple-choice prompts. The Chinese template reads:
    # "Choose the correct option among A, B, C and D; output only the option letter.\n{question}\nA. ...\nAnswer:"
    model_inputs = {"query": [], "label": []}
    task_template = "请从ABCD四个选项中选出正确的选项,仅输出选项序号。\n{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n答案:"
    for i in range(len(examples["id"])):
        query = task_template.format(
            question=examples["question"][i],
            A=examples["A"][i],
            B=examples["B"][i],
            C=examples["C"][i],
            D=examples["D"][i]
        )
        label = examples["answer"][i]
        model_inputs["query"].append(query)
        model_inputs["label"].append(label)
    return model_inputs


def format_example_cloze(examples):
    # Builds cloze prompts. The Chinese template reads:
    # "Choose the correct answer to fill in the blank; output only the correct option.\n{question}\nOptions: ...\nAnswer:"
    model_inputs = {"query": [], "label": []}
    task_template = "请选择正确的答案填空,仅输出正确的选项。\n{question}\n选项:{A}\n{B}\n{C}\n{D}\n答案:"
    for i in range(len(examples["id"])):
        query = task_template.format(
            question=examples["question"][i],
            A=examples["A"][i],
            B=examples["B"][i],
            C=examples["C"][i],
            D=examples["D"][i]
        )
        label = examples[examples["answer"][i]][i]  # map the answer letter to the text of that option
        model_inputs["query"].append(query)
        model_inputs["label"].append(label)
    return model_inputs


def format_example_openqa(examples):
    # Builds open-ended prompts. The Chinese template reads: "Answer the following question: {question}\nAnswer:"
    model_inputs = {"query": [], "label": []}
    task_template = "回答以下问题:{question}\n答案:"
    for i in range(len(examples["id"])):
        query = task_template.format(question=examples["question"][i])
        label = examples[examples["answer"][i]][i]  # map the answer letter to the text of that option
        model_inputs["query"].append(query)
        model_inputs["label"].append(label)
    return model_inputs


# Maps each task type to its prompt-building function.
TASK_DICT = {
    "choice": format_example_choice,
    "cloze": format_example_cloze,
    "openqa": format_example_openqa
}


# Maps local file extensions to `datasets.load_dataset` builder names.
EXT2TYPE = {
    "csv": "csv",
    "json": "json",
    "jsonl": "json"
}


def evaluate(
        evalset: str,
        api_base: str,
        output_file: str,
        split: str = "val",
        task_type: Literal["choice", "cloze", "openqa"] = "choice",
        n_samples: int = 20
):

    # Point the openai client at the OpenAI-compatible server; the API key is unused.
    openai.api_base = api_base
    openai.api_key = "none"

    if os.path.isfile(evalset):  # local csv/json/jsonl file
        dataset = load_dataset(EXT2TYPE[evalset.split(".")[-1]], data_files=evalset)["train"]
    elif ":" in evalset:  # hub dataset with a subset, e.g. ceval/ceval-exam:law
        evalset, subset = evalset.split(":")
        dataset = load_dataset(evalset, subset, split=split)
    else:  # hub dataset without a subset
        dataset = load_dataset(evalset, split=split)

    n_samples = min(len(dataset), n_samples)

    # Select the evaluation subset first, then build query/label pairs for the chosen task.
    dataset = dataset.select(range(n_samples))
    dataset = dataset.map(TASK_DICT[task_type], batched=True)

    n_correct = 0
    predictions = []
    for example in tqdm(dataset):
        query, label = example["query"], example["label"]
        # Near-greedy decoding with a short generation budget; `max_tokens`
        # is the Chat Completions parameter for capping generated tokens.
        predict = openai.ChatCompletion.create(
            model="default",
            messages=[{"role": "user", "content": query}],
            temperature=0.01,
            top_p=0.01,
            max_tokens=20
        ).choices[0].message.content

        # choice: compare the first generated character with the gold option letter.
        if task_type == "choice" and predict and predict[0].lower() == label[0].lower():
            n_correct += 1
        # cloze: accept the gold option text at the start or the end of the generation.
        if task_type == "cloze" and label in [predict[:len(label)], predict[-len(label):]]:
            n_correct += 1
        # openqa: accept any generation that contains the gold answer text.
        if task_type == "openqa" and label in predict:
            n_correct += 1

        predictions.append({
            "query": query,
            "label": label,
            "predict": predict
        })

    print("Result: {}/{}\nAccuracy: {:.2f}%".format(n_correct, n_samples, n_correct / n_samples * 100))

    # Dump all records as a JSON list of {"query", "label", "predict"} objects.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(predictions, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    fire.Fire(evaluate)