{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.32,
      "acc_stderr": 0.046882617226215034,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.046882617226215034
    },
    "hendrycksTest-anatomy": {
      "acc": 0.4666666666666667,
      "acc_stderr": 0.043097329010363554,
      "acc_norm": 0.4666666666666667,
      "acc_norm_stderr": 0.043097329010363554
    },
    "hendrycksTest-astronomy": {
      "acc": 0.39473684210526316,
      "acc_stderr": 0.039777499346220734,
      "acc_norm": 0.39473684210526316,
      "acc_norm_stderr": 0.039777499346220734
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.49,
      "acc_stderr": 0.05024183937956912,
      "acc_norm": 0.49,
      "acc_norm_stderr": 0.05024183937956912
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.47547169811320755,
      "acc_stderr": 0.030735822206205608,
      "acc_norm": 0.47547169811320755,
      "acc_norm_stderr": 0.030735822206205608
    },
    "hendrycksTest-college_biology": {
      "acc": 0.4583333333333333,
      "acc_stderr": 0.04166666666666666,
      "acc_norm": 0.4583333333333333,
      "acc_norm_stderr": 0.04166666666666666
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.29,
      "acc_stderr": 0.04560480215720684,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.04560480215720684
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.38,
      "acc_stderr": 0.048783173121456316,
      "acc_norm": 0.38,
      "acc_norm_stderr": 0.048783173121456316
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.25,
      "acc_stderr": 0.04351941398892446,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.04351941398892446
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.4046242774566474,
      "acc_stderr": 0.03742461193887248,
      "acc_norm": 0.4046242774566474,
      "acc_norm_stderr": 0.03742461193887248
    },
    "hendrycksTest-college_physics": {
      "acc": 0.24509803921568626,
      "acc_stderr": 0.042801058373643966,
      "acc_norm": 0.24509803921568626,
      "acc_norm_stderr": 0.042801058373643966
    },
    "hendrycksTest-computer_security": {
      "acc": 0.54,
      "acc_stderr": 0.05009082659620332,
      "acc_norm": 0.54,
      "acc_norm_stderr": 0.05009082659620332
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.40425531914893614,
      "acc_stderr": 0.03208115750788684,
      "acc_norm": 0.40425531914893614,
      "acc_norm_stderr": 0.03208115750788684
    },
    "hendrycksTest-econometrics": {
      "acc": 0.3157894736842105,
      "acc_stderr": 0.043727482902780064,
      "acc_norm": 0.3157894736842105,
      "acc_norm_stderr": 0.043727482902780064
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.4413793103448276,
      "acc_stderr": 0.04137931034482757,
      "acc_norm": 0.4413793103448276,
      "acc_norm_stderr": 0.04137931034482757
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.2830687830687831,
      "acc_stderr": 0.023201392938194974,
      "acc_norm": 0.2830687830687831,
      "acc_norm_stderr": 0.023201392938194974
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.23015873015873015,
      "acc_stderr": 0.037649508797906045,
      "acc_norm": 0.23015873015873015,
      "acc_norm_stderr": 0.037649508797906045
    },
    "hendrycksTest-global_facts": {
      "acc": 0.34,
      "acc_stderr": 0.04760952285695235,
      "acc_norm": 0.34,
      "acc_norm_stderr": 0.04760952285695235
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.47419354838709676,
      "acc_stderr": 0.028406095057653326,
      "acc_norm": 0.47419354838709676,
      "acc_norm_stderr": 0.028406095057653326
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.3251231527093596,
      "acc_stderr": 0.032957975663112704,
      "acc_norm": 0.3251231527093596,
      "acc_norm_stderr": 0.032957975663112704
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.6363636363636364,
      "acc_stderr": 0.03756335775187897,
      "acc_norm": 0.6363636363636364,
      "acc_norm_stderr": 0.03756335775187897
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.5404040404040404,
      "acc_stderr": 0.035507024651313425,
      "acc_norm": 0.5404040404040404,
      "acc_norm_stderr": 0.035507024651313425
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.6269430051813472,
      "acc_stderr": 0.03490205592048573,
      "acc_norm": 0.6269430051813472,
      "acc_norm_stderr": 0.03490205592048573
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.41025641025641024,
      "acc_stderr": 0.024939313906940784,
      "acc_norm": 0.41025641025641024,
      "acc_norm_stderr": 0.024939313906940784
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.2740740740740741,
      "acc_stderr": 0.027195934804085622,
      "acc_norm": 0.2740740740740741,
      "acc_norm_stderr": 0.027195934804085622
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.42436974789915966,
      "acc_stderr": 0.03210479051015776,
      "acc_norm": 0.42436974789915966,
      "acc_norm_stderr": 0.03210479051015776
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.271523178807947,
      "acc_stderr": 0.03631329803969653,
      "acc_norm": 0.271523178807947,
      "acc_norm_stderr": 0.03631329803969653
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.618348623853211,
      "acc_stderr": 0.020828148517022582,
      "acc_norm": 0.618348623853211,
      "acc_norm_stderr": 0.020828148517022582
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.27314814814814814,
      "acc_stderr": 0.03038805130167812,
      "acc_norm": 0.27314814814814814,
      "acc_norm_stderr": 0.03038805130167812
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.6470588235294118,
      "acc_stderr": 0.03354092437591518,
      "acc_norm": 0.6470588235294118,
      "acc_norm_stderr": 0.03354092437591518
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.6371308016877637,
      "acc_stderr": 0.03129920825530213,
      "acc_norm": 0.6371308016877637,
      "acc_norm_stderr": 0.03129920825530213
    },
    "hendrycksTest-human_aging": {
      "acc": 0.5560538116591929,
      "acc_stderr": 0.03334625674242728,
      "acc_norm": 0.5560538116591929,
      "acc_norm_stderr": 0.03334625674242728
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.549618320610687,
      "acc_stderr": 0.04363643698524779,
      "acc_norm": 0.549618320610687,
      "acc_norm_stderr": 0.04363643698524779
    },
    "hendrycksTest-international_law": {
      "acc": 0.628099173553719,
      "acc_stderr": 0.04412015806624504,
      "acc_norm": 0.628099173553719,
      "acc_norm_stderr": 0.04412015806624504
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.48148148148148145,
      "acc_stderr": 0.04830366024635331,
      "acc_norm": 0.48148148148148145,
      "acc_norm_stderr": 0.04830366024635331
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.5214723926380368,
      "acc_stderr": 0.03924746876751129,
      "acc_norm": 0.5214723926380368,
      "acc_norm_stderr": 0.03924746876751129
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.42857142857142855,
      "acc_stderr": 0.04697113923010212,
      "acc_norm": 0.42857142857142855,
      "acc_norm_stderr": 0.04697113923010212
    },
    "hendrycksTest-management": {
      "acc": 0.5242718446601942,
      "acc_stderr": 0.049449010929737795,
      "acc_norm": 0.5242718446601942,
      "acc_norm_stderr": 0.049449010929737795
    },
    "hendrycksTest-marketing": {
      "acc": 0.7094017094017094,
      "acc_stderr": 0.02974504857267408,
      "acc_norm": 0.7094017094017094,
      "acc_norm_stderr": 0.02974504857267408
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.56,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.56,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.6194125159642401,
      "acc_stderr": 0.017362564126075414,
      "acc_norm": 0.6194125159642401,
      "acc_norm_stderr": 0.017362564126075414
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.5260115606936416,
      "acc_stderr": 0.026882643434022885,
      "acc_norm": 0.5260115606936416,
      "acc_norm_stderr": 0.026882643434022885
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.23798882681564246,
      "acc_stderr": 0.014242630070574911,
      "acc_norm": 0.23798882681564246,
      "acc_norm_stderr": 0.014242630070574911
    },
    "hendrycksTest-nutrition": {
      "acc": 0.4803921568627451,
      "acc_stderr": 0.028607893699576063,
      "acc_norm": 0.4803921568627451,
      "acc_norm_stderr": 0.028607893699576063
    },
    "hendrycksTest-philosophy": {
      "acc": 0.5594855305466238,
      "acc_stderr": 0.028196400574197426,
      "acc_norm": 0.5594855305466238,
      "acc_norm_stderr": 0.028196400574197426
    },
    "hendrycksTest-prehistory": {
      "acc": 0.5462962962962963,
      "acc_stderr": 0.0277012284685426,
      "acc_norm": 0.5462962962962963,
      "acc_norm_stderr": 0.0277012284685426
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.36524822695035464,
      "acc_stderr": 0.02872386385328128,
      "acc_norm": 0.36524822695035464,
      "acc_norm_stderr": 0.02872386385328128
    },
    "hendrycksTest-professional_law": {
      "acc": 0.36571056062581486,
      "acc_stderr": 0.012301028188840567,
      "acc_norm": 0.36571056062581486,
      "acc_norm_stderr": 0.012301028188840567
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.4411764705882353,
      "acc_stderr": 0.030161911930767105,
      "acc_norm": 0.4411764705882353,
      "acc_norm_stderr": 0.030161911930767105
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.44281045751633985,
      "acc_stderr": 0.020095083154577347,
      "acc_norm": 0.44281045751633985,
      "acc_norm_stderr": 0.020095083154577347
    },
    "hendrycksTest-public_relations": {
      "acc": 0.5727272727272728,
      "acc_stderr": 0.04738198703545483,
      "acc_norm": 0.5727272727272728,
      "acc_norm_stderr": 0.04738198703545483
    },
    "hendrycksTest-security_studies": {
      "acc": 0.49795918367346936,
      "acc_stderr": 0.0320089533497105,
      "acc_norm": 0.49795918367346936,
      "acc_norm_stderr": 0.0320089533497105
    },
    "hendrycksTest-sociology": {
      "acc": 0.6318407960199005,
      "acc_stderr": 0.03410410565495301,
      "acc_norm": 0.6318407960199005,
      "acc_norm_stderr": 0.03410410565495301
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.66,
      "acc_stderr": 0.04760952285695237,
      "acc_norm": 0.66,
      "acc_norm_stderr": 0.04760952285695237
    },
    "hendrycksTest-virology": {
      "acc": 0.4036144578313253,
      "acc_stderr": 0.038194861407583984,
      "acc_norm": 0.4036144578313253,
      "acc_norm_stderr": 0.038194861407583984
    },
    "hendrycksTest-world_religions": {
      "acc": 0.6432748538011696,
      "acc_stderr": 0.03674013002860954,
      "acc_norm": 0.6432748538011696,
      "acc_norm_stderr": 0.03674013002860954
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "sparseml",
    "model_args": "pretrained=/nm/drive1/alexandre/zoo/llama2-7b-ultrachat200k_llama2_pretrain-base/training,dtype=float32",
    "num_fewshot": 5,
    "batch_size": "4",
    "batch_sizes": [],
    "device": "cuda:7",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}