|
{ |
|
"config_general": { |
|
"lighteval_sha": "494ee12240e716e804ae9ea834f84a2c864c07ca", |
|
"num_few_shot_default": 0, |
|
"num_fewshot_seeds": 1, |
|
"override_batch_size": 1, |
|
"max_samples": null, |
|
"job_id": "", |
|
"start_time": 2370587.736800548, |
|
"end_time": 2372824.830827315, |
|
"total_evaluation_time_secondes": "2237.0940267667174", |
|
"model_name": "Ramikan-BR/tinyllama-coder-py-v12", |
|
"model_sha": "5835856d42314f549c92bb77eb9ca3e44edd1cda", |
|
"model_dtype": "torch.float16", |
|
"model_size": "2.05 GB" |
|
}, |
|
"results": { |
|
"harness|arc:challenge|25": { |
|
"acc": 0.2858361774744027, |
|
"acc_stderr": 0.01320319608853737, |
|
"acc_norm": 0.3199658703071672, |
|
"acc_norm_stderr": 0.013631345807016193 |
|
}, |
|
"harness|hellaswag|10": { |
|
"acc": 0.41276638119896436, |
|
"acc_stderr": 0.004913253031155681, |
|
"acc_norm": 0.5361481776538538, |
|
"acc_norm_stderr": 0.004976724124850562 |
|
}, |
|
"harness|hendrycksTest-abstract_algebra|5": { |
|
"acc": 0.23, |
|
"acc_stderr": 0.04229525846816505, |
|
"acc_norm": 0.23, |
|
"acc_norm_stderr": 0.04229525846816505 |
|
}, |
|
"harness|hendrycksTest-anatomy|5": { |
|
"acc": 0.2814814814814815, |
|
"acc_stderr": 0.038850042458002526, |
|
"acc_norm": 0.2814814814814815, |
|
"acc_norm_stderr": 0.038850042458002526 |
|
}, |
|
"harness|hendrycksTest-astronomy|5": { |
|
"acc": 0.17763157894736842, |
|
"acc_stderr": 0.031103182383123387, |
|
"acc_norm": 0.17763157894736842, |
|
"acc_norm_stderr": 0.031103182383123387 |
|
}, |
|
"harness|hendrycksTest-business_ethics|5": { |
|
"acc": 0.26, |
|
"acc_stderr": 0.044084400227680794, |
|
"acc_norm": 0.26, |
|
"acc_norm_stderr": 0.044084400227680794 |
|
}, |
|
"harness|hendrycksTest-clinical_knowledge|5": { |
|
"acc": 0.24528301886792453, |
|
"acc_stderr": 0.026480357179895685, |
|
"acc_norm": 0.24528301886792453, |
|
"acc_norm_stderr": 0.026480357179895685 |
|
}, |
|
"harness|hendrycksTest-college_biology|5": { |
|
"acc": 0.2361111111111111, |
|
"acc_stderr": 0.03551446610810826, |
|
"acc_norm": 0.2361111111111111, |
|
"acc_norm_stderr": 0.03551446610810826 |
|
}, |
|
"harness|hendrycksTest-college_chemistry|5": { |
|
"acc": 0.39, |
|
"acc_stderr": 0.04902071300001974, |
|
"acc_norm": 0.39, |
|
"acc_norm_stderr": 0.04902071300001974 |
|
}, |
|
"harness|hendrycksTest-college_computer_science|5": { |
|
"acc": 0.33, |
|
"acc_stderr": 0.04725815626252604, |
|
"acc_norm": 0.33, |
|
"acc_norm_stderr": 0.04725815626252604 |
|
}, |
|
"harness|hendrycksTest-college_mathematics|5": { |
|
"acc": 0.33, |
|
"acc_stderr": 0.047258156262526045, |
|
"acc_norm": 0.33, |
|
"acc_norm_stderr": 0.047258156262526045 |
|
}, |
|
"harness|hendrycksTest-college_medicine|5": { |
|
"acc": 0.2138728323699422, |
|
"acc_stderr": 0.03126511206173043, |
|
"acc_norm": 0.2138728323699422, |
|
"acc_norm_stderr": 0.03126511206173043 |
|
}, |
|
"harness|hendrycksTest-college_physics|5": { |
|
"acc": 0.21568627450980393, |
|
"acc_stderr": 0.04092563958237655, |
|
"acc_norm": 0.21568627450980393, |
|
"acc_norm_stderr": 0.04092563958237655 |
|
}, |
|
"harness|hendrycksTest-computer_security|5": { |
|
"acc": 0.31, |
|
"acc_stderr": 0.04648231987117316, |
|
"acc_norm": 0.31, |
|
"acc_norm_stderr": 0.04648231987117316 |
|
}, |
|
"harness|hendrycksTest-conceptual_physics|5": { |
|
"acc": 0.251063829787234, |
|
"acc_stderr": 0.02834696377716245, |
|
"acc_norm": 0.251063829787234, |
|
"acc_norm_stderr": 0.02834696377716245 |
|
}, |
|
"harness|hendrycksTest-econometrics|5": { |
|
"acc": 0.2894736842105263, |
|
"acc_stderr": 0.04266339443159394, |
|
"acc_norm": 0.2894736842105263, |
|
"acc_norm_stderr": 0.04266339443159394 |
|
}, |
|
"harness|hendrycksTest-electrical_engineering|5": { |
|
"acc": 0.21379310344827587, |
|
"acc_stderr": 0.03416520447747548, |
|
"acc_norm": 0.21379310344827587, |
|
"acc_norm_stderr": 0.03416520447747548 |
|
}, |
|
"harness|hendrycksTest-elementary_mathematics|5": { |
|
"acc": 0.2222222222222222, |
|
"acc_stderr": 0.021411684393694203, |
|
"acc_norm": 0.2222222222222222, |
|
"acc_norm_stderr": 0.021411684393694203 |
|
}, |
|
"harness|hendrycksTest-formal_logic|5": { |
|
"acc": 0.21428571428571427, |
|
"acc_stderr": 0.03670066451047181, |
|
"acc_norm": 0.21428571428571427, |
|
"acc_norm_stderr": 0.03670066451047181 |
|
}, |
|
"harness|hendrycksTest-global_facts|5": { |
|
"acc": 0.3, |
|
"acc_stderr": 0.046056618647183814, |
|
"acc_norm": 0.3, |
|
"acc_norm_stderr": 0.046056618647183814 |
|
}, |
|
"harness|hendrycksTest-high_school_biology|5": { |
|
"acc": 0.2064516129032258, |
|
"acc_stderr": 0.023025899617188726, |
|
"acc_norm": 0.2064516129032258, |
|
"acc_norm_stderr": 0.023025899617188726 |
|
}, |
|
"harness|hendrycksTest-high_school_chemistry|5": { |
|
"acc": 0.1625615763546798, |
|
"acc_stderr": 0.025960300064605597, |
|
"acc_norm": 0.1625615763546798, |
|
"acc_norm_stderr": 0.025960300064605597 |
|
}, |
|
"harness|hendrycksTest-high_school_computer_science|5": { |
|
"acc": 0.3, |
|
"acc_stderr": 0.046056618647183814, |
|
"acc_norm": 0.3, |
|
"acc_norm_stderr": 0.046056618647183814 |
|
}, |
|
"harness|hendrycksTest-high_school_european_history|5": { |
|
"acc": 0.21212121212121213, |
|
"acc_stderr": 0.03192271569548299, |
|
"acc_norm": 0.21212121212121213, |
|
"acc_norm_stderr": 0.03192271569548299 |
|
}, |
|
"harness|hendrycksTest-high_school_geography|5": { |
|
"acc": 0.20707070707070707, |
|
"acc_stderr": 0.028869778460267063, |
|
"acc_norm": 0.20707070707070707, |
|
"acc_norm_stderr": 0.028869778460267063 |
|
}, |
|
"harness|hendrycksTest-high_school_government_and_politics|5": { |
|
"acc": 0.26424870466321243, |
|
"acc_stderr": 0.031821550509166484, |
|
"acc_norm": 0.26424870466321243, |
|
"acc_norm_stderr": 0.031821550509166484 |
|
}, |
|
"harness|hendrycksTest-high_school_macroeconomics|5": { |
|
"acc": 0.32051282051282054, |
|
"acc_stderr": 0.023661296393964273, |
|
"acc_norm": 0.32051282051282054, |
|
"acc_norm_stderr": 0.023661296393964273 |
|
}, |
|
"harness|hendrycksTest-high_school_mathematics|5": { |
|
"acc": 0.2814814814814815, |
|
"acc_stderr": 0.027420019350945277, |
|
"acc_norm": 0.2814814814814815, |
|
"acc_norm_stderr": 0.027420019350945277 |
|
}, |
|
"harness|hendrycksTest-high_school_microeconomics|5": { |
|
"acc": 0.22268907563025211, |
|
"acc_stderr": 0.027025433498882367, |
|
"acc_norm": 0.22268907563025211, |
|
"acc_norm_stderr": 0.027025433498882367 |
|
}, |
|
"harness|hendrycksTest-high_school_physics|5": { |
|
"acc": 0.2847682119205298, |
|
"acc_stderr": 0.03684881521389023, |
|
"acc_norm": 0.2847682119205298, |
|
"acc_norm_stderr": 0.03684881521389023 |
|
}, |
|
"harness|hendrycksTest-high_school_psychology|5": { |
|
"acc": 0.24954128440366974, |
|
"acc_stderr": 0.01855389762950161, |
|
"acc_norm": 0.24954128440366974, |
|
"acc_norm_stderr": 0.01855389762950161 |
|
}, |
|
"harness|hendrycksTest-high_school_statistics|5": { |
|
"acc": 0.4722222222222222, |
|
"acc_stderr": 0.0340470532865388, |
|
"acc_norm": 0.4722222222222222, |
|
"acc_norm_stderr": 0.0340470532865388 |
|
}, |
|
"harness|hendrycksTest-high_school_us_history|5": { |
|
"acc": 0.25980392156862747, |
|
"acc_stderr": 0.030778554678693264, |
|
"acc_norm": 0.25980392156862747, |
|
"acc_norm_stderr": 0.030778554678693264 |
|
}, |
|
"harness|hendrycksTest-high_school_world_history|5": { |
|
"acc": 0.2616033755274262, |
|
"acc_stderr": 0.028609516716994934, |
|
"acc_norm": 0.2616033755274262, |
|
"acc_norm_stderr": 0.028609516716994934 |
|
}, |
|
"harness|hendrycksTest-human_aging|5": { |
|
"acc": 0.3004484304932735, |
|
"acc_stderr": 0.030769352008229143, |
|
"acc_norm": 0.3004484304932735, |
|
"acc_norm_stderr": 0.030769352008229143 |
|
}, |
|
"harness|hendrycksTest-human_sexuality|5": { |
|
"acc": 0.26717557251908397, |
|
"acc_stderr": 0.038808483010823944, |
|
"acc_norm": 0.26717557251908397, |
|
"acc_norm_stderr": 0.038808483010823944 |
|
}, |
|
"harness|hendrycksTest-international_law|5": { |
|
"acc": 0.24793388429752067, |
|
"acc_stderr": 0.03941897526516302, |
|
"acc_norm": 0.24793388429752067, |
|
"acc_norm_stderr": 0.03941897526516302 |
|
}, |
|
"harness|hendrycksTest-jurisprudence|5": { |
|
"acc": 0.25925925925925924, |
|
"acc_stderr": 0.042365112580946336, |
|
"acc_norm": 0.25925925925925924, |
|
"acc_norm_stderr": 0.042365112580946336 |
|
}, |
|
"harness|hendrycksTest-logical_fallacies|5": { |
|
"acc": 0.2883435582822086, |
|
"acc_stderr": 0.035590395316173425, |
|
"acc_norm": 0.2883435582822086, |
|
"acc_norm_stderr": 0.035590395316173425 |
|
}, |
|
"harness|hendrycksTest-machine_learning|5": { |
|
"acc": 0.25892857142857145, |
|
"acc_stderr": 0.041577515398656284, |
|
"acc_norm": 0.25892857142857145, |
|
"acc_norm_stderr": 0.041577515398656284 |
|
}, |
|
"harness|hendrycksTest-management|5": { |
|
"acc": 0.1650485436893204, |
|
"acc_stderr": 0.036756688322331886, |
|
"acc_norm": 0.1650485436893204, |
|
"acc_norm_stderr": 0.036756688322331886 |
|
}, |
|
"harness|hendrycksTest-marketing|5": { |
|
"acc": 0.3247863247863248, |
|
"acc_stderr": 0.030679022765498835, |
|
"acc_norm": 0.3247863247863248, |
|
"acc_norm_stderr": 0.030679022765498835 |
|
}, |
|
"harness|hendrycksTest-medical_genetics|5": { |
|
"acc": 0.29, |
|
"acc_stderr": 0.045604802157206845, |
|
"acc_norm": 0.29, |
|
"acc_norm_stderr": 0.045604802157206845 |
|
}, |
|
"harness|hendrycksTest-miscellaneous|5": { |
|
"acc": 0.280970625798212, |
|
"acc_stderr": 0.01607312785122126, |
|
"acc_norm": 0.280970625798212, |
|
"acc_norm_stderr": 0.01607312785122126 |
|
}, |
|
"harness|hendrycksTest-moral_disputes|5": { |
|
"acc": 0.24855491329479767, |
|
"acc_stderr": 0.023267528432100174, |
|
"acc_norm": 0.24855491329479767, |
|
"acc_norm_stderr": 0.023267528432100174 |
|
}, |
|
"harness|hendrycksTest-moral_scenarios|5": { |
|
"acc": 0.25027932960893856, |
|
"acc_stderr": 0.01448750085285041, |
|
"acc_norm": 0.25027932960893856, |
|
"acc_norm_stderr": 0.01448750085285041 |
|
}, |
|
"harness|hendrycksTest-nutrition|5": { |
|
"acc": 0.21895424836601307, |
|
"acc_stderr": 0.02367908986180772, |
|
"acc_norm": 0.21895424836601307, |
|
"acc_norm_stderr": 0.02367908986180772 |
|
}, |
|
"harness|hendrycksTest-philosophy|5": { |
|
"acc": 0.2733118971061093, |
|
"acc_stderr": 0.025311765975426115, |
|
"acc_norm": 0.2733118971061093, |
|
"acc_norm_stderr": 0.025311765975426115 |
|
}, |
|
"harness|hendrycksTest-prehistory|5": { |
|
"acc": 0.19135802469135801, |
|
"acc_stderr": 0.021887704613396158, |
|
"acc_norm": 0.19135802469135801, |
|
"acc_norm_stderr": 0.021887704613396158 |
|
}, |
|
"harness|hendrycksTest-professional_accounting|5": { |
|
"acc": 0.24113475177304963, |
|
"acc_stderr": 0.02551873104953776, |
|
"acc_norm": 0.24113475177304963, |
|
"acc_norm_stderr": 0.02551873104953776 |
|
}, |
|
"harness|hendrycksTest-professional_law|5": { |
|
"acc": 0.24445893089960888, |
|
"acc_stderr": 0.010976425013113897, |
|
"acc_norm": 0.24445893089960888, |
|
"acc_norm_stderr": 0.010976425013113897 |
|
}, |
|
"harness|hendrycksTest-professional_medicine|5": { |
|
"acc": 0.4485294117647059, |
|
"acc_stderr": 0.030211479609121593, |
|
"acc_norm": 0.4485294117647059, |
|
"acc_norm_stderr": 0.030211479609121593 |
|
}, |
|
"harness|hendrycksTest-professional_psychology|5": { |
|
"acc": 0.21568627450980393, |
|
"acc_stderr": 0.01663931935031326, |
|
"acc_norm": 0.21568627450980393, |
|
"acc_norm_stderr": 0.01663931935031326 |
|
}, |
|
"harness|hendrycksTest-public_relations|5": { |
|
"acc": 0.23636363636363636, |
|
"acc_stderr": 0.04069306319721376, |
|
"acc_norm": 0.23636363636363636, |
|
"acc_norm_stderr": 0.04069306319721376 |
|
}, |
|
"harness|hendrycksTest-security_studies|5": { |
|
"acc": 0.2571428571428571, |
|
"acc_stderr": 0.02797982353874455, |
|
"acc_norm": 0.2571428571428571, |
|
"acc_norm_stderr": 0.02797982353874455 |
|
}, |
|
"harness|hendrycksTest-sociology|5": { |
|
"acc": 0.2537313432835821, |
|
"acc_stderr": 0.030769444967296018, |
|
"acc_norm": 0.2537313432835821, |
|
"acc_norm_stderr": 0.030769444967296018 |
|
}, |
|
"harness|hendrycksTest-us_foreign_policy|5": { |
|
"acc": 0.23, |
|
"acc_stderr": 0.04229525846816506, |
|
"acc_norm": 0.23, |
|
"acc_norm_stderr": 0.04229525846816506 |
|
}, |
|
"harness|hendrycksTest-virology|5": { |
|
"acc": 0.26506024096385544, |
|
"acc_stderr": 0.03436024037944967, |
|
"acc_norm": 0.26506024096385544, |
|
"acc_norm_stderr": 0.03436024037944967 |
|
}, |
|
"harness|hendrycksTest-world_religions|5": { |
|
"acc": 0.3216374269005848, |
|
"acc_stderr": 0.03582529442573122, |
|
"acc_norm": 0.3216374269005848, |
|
"acc_norm_stderr": 0.03582529442573122 |
|
}, |
|
"harness|truthfulqa:mc|0": { |
|
"mc1": 0.25458996328029376, |
|
"mc1_stderr": 0.015250117079156496, |
|
"mc2": 0.4091007666951377, |
|
"mc2_stderr": 0.014365367143474025 |
|
}, |
|
"harness|winogrande|5": { |
|
"acc": 0.5706393054459353, |
|
"acc_stderr": 0.013911537499969163 |
|
}, |
|
"harness|gsm8k|5": { |
|
"acc": 0.014404852160727824, |
|
"acc_stderr": 0.0032820559171369444 |
|
}, |
|
"all": { |
|
"acc": 0.26686435788728485, |
|
"acc_stderr": 0.031169507718254618, |
|
"acc_norm": 0.2686642908950062, |
|
"acc_norm_stderr": 0.03194301691878982, |
|
"mc1": 0.25458996328029376, |
|
"mc1_stderr": 0.015250117079156496, |
|
"mc2": 0.4091007666951377, |
|
"mc2_stderr": 0.014365367143474025 |
|
} |
|
}, |
|
"versions": { |
|
"all": 0, |
|
"harness|arc:challenge|25": 0, |
|
"harness|gsm8k|5": 0, |
|
"harness|hellaswag|10": 0, |
|
"harness|hendrycksTest-abstract_algebra|5": 1, |
|
"harness|hendrycksTest-anatomy|5": 1, |
|
"harness|hendrycksTest-astronomy|5": 1, |
|
"harness|hendrycksTest-business_ethics|5": 1, |
|
"harness|hendrycksTest-clinical_knowledge|5": 1, |
|
"harness|hendrycksTest-college_biology|5": 1, |
|
"harness|hendrycksTest-college_chemistry|5": 1, |
|
"harness|hendrycksTest-college_computer_science|5": 1, |
|
"harness|hendrycksTest-college_mathematics|5": 1, |
|
"harness|hendrycksTest-college_medicine|5": 1, |
|
"harness|hendrycksTest-college_physics|5": 1, |
|
"harness|hendrycksTest-computer_security|5": 1, |
|
"harness|hendrycksTest-conceptual_physics|5": 1, |
|
"harness|hendrycksTest-econometrics|5": 1, |
|
"harness|hendrycksTest-electrical_engineering|5": 1, |
|
"harness|hendrycksTest-elementary_mathematics|5": 1, |
|
"harness|hendrycksTest-formal_logic|5": 1, |
|
"harness|hendrycksTest-global_facts|5": 1, |
|
"harness|hendrycksTest-high_school_biology|5": 1, |
|
"harness|hendrycksTest-high_school_chemistry|5": 1, |
|
"harness|hendrycksTest-high_school_computer_science|5": 1, |
|
"harness|hendrycksTest-high_school_european_history|5": 1, |
|
"harness|hendrycksTest-high_school_geography|5": 1, |
|
"harness|hendrycksTest-high_school_government_and_politics|5": 1, |
|
"harness|hendrycksTest-high_school_macroeconomics|5": 1, |
|
"harness|hendrycksTest-high_school_mathematics|5": 1, |
|
"harness|hendrycksTest-high_school_microeconomics|5": 1, |
|
"harness|hendrycksTest-high_school_physics|5": 1, |
|
"harness|hendrycksTest-high_school_psychology|5": 1, |
|
"harness|hendrycksTest-high_school_statistics|5": 1, |
|
"harness|hendrycksTest-high_school_us_history|5": 1, |
|
"harness|hendrycksTest-high_school_world_history|5": 1, |
|
"harness|hendrycksTest-human_aging|5": 1, |
|
"harness|hendrycksTest-human_sexuality|5": 1, |
|
"harness|hendrycksTest-international_law|5": 1, |
|
"harness|hendrycksTest-jurisprudence|5": 1, |
|
"harness|hendrycksTest-logical_fallacies|5": 1, |
|
"harness|hendrycksTest-machine_learning|5": 1, |
|
"harness|hendrycksTest-management|5": 1, |
|
"harness|hendrycksTest-marketing|5": 1, |
|
"harness|hendrycksTest-medical_genetics|5": 1, |
|
"harness|hendrycksTest-miscellaneous|5": 1, |
|
"harness|hendrycksTest-moral_disputes|5": 1, |
|
"harness|hendrycksTest-moral_scenarios|5": 1, |
|
"harness|hendrycksTest-nutrition|5": 1, |
|
"harness|hendrycksTest-philosophy|5": 1, |
|
"harness|hendrycksTest-prehistory|5": 1, |
|
"harness|hendrycksTest-professional_accounting|5": 1, |
|
"harness|hendrycksTest-professional_law|5": 1, |
|
"harness|hendrycksTest-professional_medicine|5": 1, |
|
"harness|hendrycksTest-professional_psychology|5": 1, |
|
"harness|hendrycksTest-public_relations|5": 1, |
|
"harness|hendrycksTest-security_studies|5": 1, |
|
"harness|hendrycksTest-sociology|5": 1, |
|
"harness|hendrycksTest-us_foreign_policy|5": 1, |
|
"harness|hendrycksTest-virology|5": 1, |
|
"harness|hendrycksTest-world_religions|5": 1, |
|
"harness|truthfulqa:mc|0": 1, |
|
"harness|winogrande|5": 0 |
|
}, |
|
"config_tasks": { |
|
"harness|arc:challenge": "LM Harness task", |
|
"harness|gsm8k": "LM Harness task", |
|
"harness|hellaswag": "LM Harness task", |
|
"harness|hendrycksTest-abstract_algebra": "LM Harness task", |
|
"harness|hendrycksTest-anatomy": "LM Harness task", |
|
"harness|hendrycksTest-astronomy": "LM Harness task", |
|
"harness|hendrycksTest-business_ethics": "LM Harness task", |
|
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", |
|
"harness|hendrycksTest-college_biology": "LM Harness task", |
|
"harness|hendrycksTest-college_chemistry": "LM Harness task", |
|
"harness|hendrycksTest-college_computer_science": "LM Harness task", |
|
"harness|hendrycksTest-college_mathematics": "LM Harness task", |
|
"harness|hendrycksTest-college_medicine": "LM Harness task", |
|
"harness|hendrycksTest-college_physics": "LM Harness task", |
|
"harness|hendrycksTest-computer_security": "LM Harness task", |
|
"harness|hendrycksTest-conceptual_physics": "LM Harness task", |
|
"harness|hendrycksTest-econometrics": "LM Harness task", |
|
"harness|hendrycksTest-electrical_engineering": "LM Harness task", |
|
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", |
|
"harness|hendrycksTest-formal_logic": "LM Harness task", |
|
"harness|hendrycksTest-global_facts": "LM Harness task", |
|
"harness|hendrycksTest-high_school_biology": "LM Harness task", |
|
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", |
|
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", |
|
"harness|hendrycksTest-high_school_european_history": "LM Harness task", |
|
"harness|hendrycksTest-high_school_geography": "LM Harness task", |
|
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", |
|
"harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", |
|
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", |
|
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", |
|
"harness|hendrycksTest-high_school_physics": "LM Harness task", |
|
"harness|hendrycksTest-high_school_psychology": "LM Harness task", |
|
"harness|hendrycksTest-high_school_statistics": "LM Harness task", |
|
"harness|hendrycksTest-high_school_us_history": "LM Harness task", |
|
"harness|hendrycksTest-high_school_world_history": "LM Harness task", |
|
"harness|hendrycksTest-human_aging": "LM Harness task", |
|
"harness|hendrycksTest-human_sexuality": "LM Harness task", |
|
"harness|hendrycksTest-international_law": "LM Harness task", |
|
"harness|hendrycksTest-jurisprudence": "LM Harness task", |
|
"harness|hendrycksTest-logical_fallacies": "LM Harness task", |
|
"harness|hendrycksTest-machine_learning": "LM Harness task", |
|
"harness|hendrycksTest-management": "LM Harness task", |
|
"harness|hendrycksTest-marketing": "LM Harness task", |
|
"harness|hendrycksTest-medical_genetics": "LM Harness task", |
|
"harness|hendrycksTest-miscellaneous": "LM Harness task", |
|
"harness|hendrycksTest-moral_disputes": "LM Harness task", |
|
"harness|hendrycksTest-moral_scenarios": "LM Harness task", |
|
"harness|hendrycksTest-nutrition": "LM Harness task", |
|
"harness|hendrycksTest-philosophy": "LM Harness task", |
|
"harness|hendrycksTest-prehistory": "LM Harness task", |
|
"harness|hendrycksTest-professional_accounting": "LM Harness task", |
|
"harness|hendrycksTest-professional_law": "LM Harness task", |
|
"harness|hendrycksTest-professional_medicine": "LM Harness task", |
|
"harness|hendrycksTest-professional_psychology": "LM Harness task", |
|
"harness|hendrycksTest-public_relations": "LM Harness task", |
|
"harness|hendrycksTest-security_studies": "LM Harness task", |
|
"harness|hendrycksTest-sociology": "LM Harness task", |
|
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", |
|
"harness|hendrycksTest-virology": "LM Harness task", |
|
"harness|hendrycksTest-world_religions": "LM Harness task", |
|
"harness|truthfulqa:mc": "LM Harness task", |
|
"harness|winogrande": "LM Harness task" |
|
}, |
|
"summary_tasks": { |
|
"harness|arc:challenge|25": { |
|
"hashes": { |
|
"hash_examples": "17b0cae357c0259e", |
|
"hash_full_prompts": "045cbb916e5145c6", |
|
"hash_input_tokens": "ca48d52265c0051f", |
|
"hash_cont_tokens": "e8abf848493b50f7" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 1172, |
|
"padded": 4687, |
|
"non_padded": 0, |
|
"effective_few_shots": 25.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hellaswag|10": { |
|
"hashes": { |
|
"hash_examples": "e1768ecb99d7ecf0", |
|
"hash_full_prompts": "0b4c16983130f84f", |
|
"hash_input_tokens": "4975ded0ed31f702", |
|
"hash_cont_tokens": "9fe0a5c42e1532db" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 10042, |
|
"padded": 40019, |
|
"non_padded": 149, |
|
"effective_few_shots": 10.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-abstract_algebra|5": { |
|
"hashes": { |
|
"hash_examples": "280f9f325b40559a", |
|
"hash_full_prompts": "2f776a367d23aea2", |
|
"hash_input_tokens": "8ff523ec326d5d55", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-anatomy|5": { |
|
"hashes": { |
|
"hash_examples": "2f83a4f1cab4ba18", |
|
"hash_full_prompts": "516f74bef25df620", |
|
"hash_input_tokens": "742bd6a389a8ef40", |
|
"hash_cont_tokens": "f11971a765cb609f" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 135, |
|
"padded": 540, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-astronomy|5": { |
|
"hashes": { |
|
"hash_examples": "7d587b908da4d762", |
|
"hash_full_prompts": "faf4e80f65de93ca", |
|
"hash_input_tokens": "aa9743839c83bd9f", |
|
"hash_cont_tokens": "440a970fadecdc7b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 152, |
|
"padded": 608, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-business_ethics|5": { |
|
"hashes": { |
|
"hash_examples": "33e51740670de686", |
|
"hash_full_prompts": "db01c3ef8e1479d4", |
|
"hash_input_tokens": "60f6ed52e2a2987a", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-clinical_knowledge|5": { |
|
"hashes": { |
|
"hash_examples": "f3366dbe7eefffa4", |
|
"hash_full_prompts": "49654f71d94b65c3", |
|
"hash_input_tokens": "6080d9f3c5930be0", |
|
"hash_cont_tokens": "7ecd60c25b9bfe5b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 265, |
|
"padded": 1060, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-college_biology|5": { |
|
"hashes": { |
|
"hash_examples": "ca2b6753a0193e7f", |
|
"hash_full_prompts": "2b460b75f1fdfefd", |
|
"hash_input_tokens": "873319724ad65589", |
|
"hash_cont_tokens": "875cde3af7a0ee14" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 144, |
|
"padded": 564, |
|
"non_padded": 12, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-college_chemistry|5": { |
|
"hashes": { |
|
"hash_examples": "22ff85f1d34f42d1", |
|
"hash_full_prompts": "242c9be6da583e95", |
|
"hash_input_tokens": "8366d04d12b154a7", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-college_computer_science|5": { |
|
"hashes": { |
|
"hash_examples": "30318289d717a5cf", |
|
"hash_full_prompts": "ed2bdb4e87c4b371", |
|
"hash_input_tokens": "1724a282fb269fd7", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-college_mathematics|5": { |
|
"hashes": { |
|
"hash_examples": "4944d1f0b6b5d911", |
|
"hash_full_prompts": "770bc4281c973190", |
|
"hash_input_tokens": "b7aa815781eae172", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-college_medicine|5": { |
|
"hashes": { |
|
"hash_examples": "dd69cc33381275af", |
|
"hash_full_prompts": "ad2a53e5250ab46e", |
|
"hash_input_tokens": "0003d13e86bc8c1a", |
|
"hash_cont_tokens": "702fb6d82ff0d6ac" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 173, |
|
"padded": 692, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-college_physics|5": { |
|
"hashes": { |
|
"hash_examples": "875dd26d22655b0d", |
|
"hash_full_prompts": "833a0d7b55aed500", |
|
"hash_input_tokens": "32b28762dd077c78", |
|
"hash_cont_tokens": "f7b8097afc16a47c" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 102, |
|
"padded": 404, |
|
"non_padded": 4, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-computer_security|5": { |
|
"hashes": { |
|
"hash_examples": "006451eedc0ededb", |
|
"hash_full_prompts": "94034c97e85d8f46", |
|
"hash_input_tokens": "19dd0e1895125d49", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-conceptual_physics|5": { |
|
"hashes": { |
|
"hash_examples": "8874ece872d2ca4c", |
|
"hash_full_prompts": "e40d15a34640d6fa", |
|
"hash_input_tokens": "761c7ce187b3338a", |
|
"hash_cont_tokens": "aa0e8bc655f2f641" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 235, |
|
"padded": 940, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-econometrics|5": { |
|
"hashes": { |
|
"hash_examples": "64d3623b0bfaa43f", |
|
"hash_full_prompts": "612f340fae41338d", |
|
"hash_input_tokens": "dae74024ebc12b2b", |
|
"hash_cont_tokens": "b1cc6e7e9fcd3827" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 114, |
|
"padded": 456, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-electrical_engineering|5": { |
|
"hashes": { |
|
"hash_examples": "e98f51780c674d7e", |
|
"hash_full_prompts": "10275b312d812ae6", |
|
"hash_input_tokens": "5fa8050688a246ed", |
|
"hash_cont_tokens": "2425a3f084a591ef" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 145, |
|
"padded": 580, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-elementary_mathematics|5": { |
|
"hashes": { |
|
"hash_examples": "fc48208a5ac1c0ce", |
|
"hash_full_prompts": "5ec274c6c82aca23", |
|
"hash_input_tokens": "2da3f8d7d1515cc6", |
|
"hash_cont_tokens": "bd87bf0c060fd925" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 378, |
|
"padded": 1512, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-formal_logic|5": { |
|
"hashes": { |
|
"hash_examples": "5a6525665f63ea72", |
|
"hash_full_prompts": "07b92638c4a6b500", |
|
"hash_input_tokens": "907de61bbe46dada", |
|
"hash_cont_tokens": "eb8932890e0605db" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 126, |
|
"padded": 504, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-global_facts|5": { |
|
"hashes": { |
|
"hash_examples": "371d70d743b2b89b", |
|
"hash_full_prompts": "332fdee50a1921b4", |
|
"hash_input_tokens": "d7549fe9ac133643", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_biology|5": { |
|
"hashes": { |
|
"hash_examples": "a79e1018b1674052", |
|
"hash_full_prompts": "e624e26ede922561", |
|
"hash_input_tokens": "b449ae8cd622fb96", |
|
"hash_cont_tokens": "1ddcb86d28cde266" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 310, |
|
"padded": 1240, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_chemistry|5": { |
|
"hashes": { |
|
"hash_examples": "44bfc25c389f0e03", |
|
"hash_full_prompts": "0e3e5f5d9246482a", |
|
"hash_input_tokens": "a447bd1574b5e26c", |
|
"hash_cont_tokens": "176c8dcff38c5f8f" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 203, |
|
"padded": 812, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_computer_science|5": { |
|
"hashes": { |
|
"hash_examples": "8b8cdb1084f24169", |
|
"hash_full_prompts": "c00487e67c1813cc", |
|
"hash_input_tokens": "56312a0c3d85ae90", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_european_history|5": { |
|
"hashes": { |
|
"hash_examples": "11cd32d0ef440171", |
|
"hash_full_prompts": "318f4513c537c6bf", |
|
"hash_input_tokens": "5002f4ac8b1562ca", |
|
"hash_cont_tokens": "674fc454bdc5ac93" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 165, |
|
"padded": 656, |
|
"non_padded": 4, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_geography|5": { |
|
"hashes": { |
|
"hash_examples": "b60019b9e80b642f", |
|
"hash_full_prompts": "ee5789fcc1a81b1e", |
|
"hash_input_tokens": "b4f9efd054b0149d", |
|
"hash_cont_tokens": "03a5012b916274ea" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 198, |
|
"padded": 792, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_government_and_politics|5": { |
|
"hashes": { |
|
"hash_examples": "d221ec983d143dc3", |
|
"hash_full_prompts": "ac42d888e1ce1155", |
|
"hash_input_tokens": "6e010d01707b5a01", |
|
"hash_cont_tokens": "873d2aab226ba1d8" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 193, |
|
"padded": 772, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_macroeconomics|5": { |
|
"hashes": { |
|
"hash_examples": "59c2915cacfd3fbb", |
|
"hash_full_prompts": "c6bd9d25158abd0e", |
|
"hash_input_tokens": "fc1f6e824ba386d7", |
|
"hash_cont_tokens": "c583432ad27fcfe0" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 390, |
|
"padded": 1560, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_mathematics|5": { |
|
"hashes": { |
|
"hash_examples": "1f8ac897608de342", |
|
"hash_full_prompts": "5d88f41fc2d643a8", |
|
"hash_input_tokens": "3a485a40c8432ece", |
|
"hash_cont_tokens": "d7907b61bcb8c123" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 270, |
|
"padded": 1080, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_microeconomics|5": { |
|
"hashes": { |
|
"hash_examples": "ead6a0f2f6c83370", |
|
"hash_full_prompts": "bfc393381298609e", |
|
"hash_input_tokens": "a7dd9ca4bbda3752", |
|
"hash_cont_tokens": "f47f041de50333b9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 238, |
|
"padded": 952, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_physics|5": { |
|
"hashes": { |
|
"hash_examples": "c3f2025990afec64", |
|
"hash_full_prompts": "fc78b4997e436734", |
|
"hash_input_tokens": "d7ea631399a73865", |
|
"hash_cont_tokens": "0d56317b3e5eedb5" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 151, |
|
"padded": 604, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_psychology|5": { |
|
"hashes": { |
|
"hash_examples": "21f8aab618f6d636", |
|
"hash_full_prompts": "d5c76aa40b9dbc43", |
|
"hash_input_tokens": "d12816cf88146011", |
|
"hash_cont_tokens": "09ba1243e7390c0f" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 545, |
|
"padded": 2180, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_statistics|5": { |
|
"hashes": { |
|
"hash_examples": "2386a60a11fc5de3", |
|
"hash_full_prompts": "4c5c8be5aafac432", |
|
"hash_input_tokens": "9763ecaef4814c21", |
|
"hash_cont_tokens": "9cc29889c3d3f77d" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 216, |
|
"padded": 864, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_us_history|5": { |
|
"hashes": { |
|
"hash_examples": "74961543be40f04f", |
|
"hash_full_prompts": "5d5ca4840131ba21", |
|
"hash_input_tokens": "c639cce12a46ebad", |
|
"hash_cont_tokens": "cdd0b3dc06d933e5" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 204, |
|
"padded": 816, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-high_school_world_history|5": { |
|
"hashes": { |
|
"hash_examples": "2ad2f6b7198b2234", |
|
"hash_full_prompts": "11845057459afd72", |
|
"hash_input_tokens": "b9762065cce6f3a6", |
|
"hash_cont_tokens": "e02816433ff28daf" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 237, |
|
"padded": 948, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-human_aging|5": { |
|
"hashes": { |
|
"hash_examples": "1a7199dc733e779b", |
|
"hash_full_prompts": "756b9096b8eaf892", |
|
"hash_input_tokens": "84157fee0b6d0f3c", |
|
"hash_cont_tokens": "142a4a8a1138a214" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 223, |
|
"padded": 892, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-human_sexuality|5": { |
|
"hashes": { |
|
"hash_examples": "7acb8fdad97f88a6", |
|
"hash_full_prompts": "731a52ff15b8cfdb", |
|
"hash_input_tokens": "ade303e1ae3c016f", |
|
"hash_cont_tokens": "bc54813e809b796d" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 131, |
|
"padded": 524, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-international_law|5": { |
|
"hashes": { |
|
"hash_examples": "1300bfd0dfc59114", |
|
"hash_full_prompts": "db2aefbff5eec996", |
|
"hash_input_tokens": "e5482e1c23c23d35", |
|
"hash_cont_tokens": "8ea8c5ff76a15bca" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 121, |
|
"padded": 484, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-jurisprudence|5": { |
|
"hashes": { |
|
"hash_examples": "083b1e4904c48dc2", |
|
"hash_full_prompts": "0f89ee3fe03d6a21", |
|
"hash_input_tokens": "4415eeb9bad0507b", |
|
"hash_cont_tokens": "e3a8cd951b6e3469" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 108, |
|
"padded": 432, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-logical_fallacies|5": { |
|
"hashes": { |
|
"hash_examples": "709128f9926a634c", |
|
"hash_full_prompts": "98a04b1f8f841069", |
|
"hash_input_tokens": "e6b5271422ecbaa8", |
|
"hash_cont_tokens": "3e9e0bdc248fd88a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 163, |
|
"padded": 644, |
|
"non_padded": 8, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-machine_learning|5": { |
|
"hashes": { |
|
"hash_examples": "88f22a636029ae47", |
|
"hash_full_prompts": "2e1c8d4b1e0cc921", |
|
"hash_input_tokens": "e719cb83196977d8", |
|
"hash_cont_tokens": "55b12fb138c6a064" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 112, |
|
"padded": 448, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-management|5": { |
|
"hashes": { |
|
"hash_examples": "8c8a1e07a2151dca", |
|
"hash_full_prompts": "f51611f514b265b0", |
|
"hash_input_tokens": "155da0e62b39e804", |
|
"hash_cont_tokens": "a01d6d39a83c4597" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 103, |
|
"padded": 412, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-marketing|5": { |
|
"hashes": { |
|
"hash_examples": "2668953431f91e96", |
|
"hash_full_prompts": "77562bef997c7650", |
|
"hash_input_tokens": "38466c242259e6d3", |
|
"hash_cont_tokens": "6aeaed4d823c98aa" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 234, |
|
"padded": 932, |
|
"non_padded": 4, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-medical_genetics|5": { |
|
"hashes": { |
|
"hash_examples": "9c2dda34a2ea4fd2", |
|
"hash_full_prompts": "202139046daa118f", |
|
"hash_input_tokens": "0dd129e92538a7f6", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-miscellaneous|5": { |
|
"hashes": { |
|
"hash_examples": "41adb694024809c2", |
|
"hash_full_prompts": "bffec9fc237bcf93", |
|
"hash_input_tokens": "d108a883fc3e022f", |
|
"hash_cont_tokens": "9b0ab02a64603081" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 783, |
|
"padded": 3132, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-moral_disputes|5": { |
|
"hashes": { |
|
"hash_examples": "3171c13ba3c594c4", |
|
"hash_full_prompts": "170831fc36f1d59e", |
|
"hash_input_tokens": "0e7b7df82884a2d5", |
|
"hash_cont_tokens": "3b8bbe9108e55ce9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 346, |
|
"padded": 1364, |
|
"non_padded": 20, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-moral_scenarios|5": { |
|
"hashes": { |
|
"hash_examples": "9873e077e83e0546", |
|
"hash_full_prompts": "08f4ceba3131a068", |
|
"hash_input_tokens": "7c220f5613cd8426", |
|
"hash_cont_tokens": "3e9bfc0362e97330" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 895, |
|
"padded": 3580, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-nutrition|5": { |
|
"hashes": { |
|
"hash_examples": "7db1d8142ec14323", |
|
"hash_full_prompts": "4c0e68e3586cb453", |
|
"hash_input_tokens": "35de1609a9a763a9", |
|
"hash_cont_tokens": "23b2dc6ee2da4cfc" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 306, |
|
"padded": 1224, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-philosophy|5": { |
|
"hashes": { |
|
"hash_examples": "9b455b7d72811cc8", |
|
"hash_full_prompts": "e467f822d8a0d3ff", |
|
"hash_input_tokens": "a1dcfa9c80490d06", |
|
"hash_cont_tokens": "9f6ff69d23a48783" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 311, |
|
"padded": 1244, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-prehistory|5": { |
|
"hashes": { |
|
"hash_examples": "8be90d0f538f1560", |
|
"hash_full_prompts": "152187949bcd0921", |
|
"hash_input_tokens": "a091cf645d2415e0", |
|
"hash_cont_tokens": "d6458d743d875837" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 324, |
|
"padded": 1296, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-professional_accounting|5": { |
|
"hashes": { |
|
"hash_examples": "8d377597916cd07e", |
|
"hash_full_prompts": "0eb7345d6144ee0d", |
|
"hash_input_tokens": "e9df32a33f85290c", |
|
"hash_cont_tokens": "922a195f53a35662" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 282, |
|
"padded": 1128, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-professional_law|5": { |
|
"hashes": { |
|
"hash_examples": "cd9dbc52b3c932d6", |
|
"hash_full_prompts": "36ac764272bfb182", |
|
"hash_input_tokens": "c9f7583fff66d361", |
|
"hash_cont_tokens": "2e590029ef41fbcd" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 1534, |
|
"padded": 6136, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-professional_medicine|5": { |
|
"hashes": { |
|
"hash_examples": "b20e4e816c1e383e", |
|
"hash_full_prompts": "7b8d69ea2acaf2f7", |
|
"hash_input_tokens": "40a933f829116f8d", |
|
"hash_cont_tokens": "7cfee54dbddd5a98" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 272, |
|
"padded": 1088, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-professional_psychology|5": { |
|
"hashes": { |
|
"hash_examples": "d45b73b22f9cc039", |
|
"hash_full_prompts": "fe8937e9ffc99771", |
|
"hash_input_tokens": "0f6a92c3a2062b48", |
|
"hash_cont_tokens": "a86677b2a45c20e1" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 612, |
|
"padded": 2448, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-public_relations|5": { |
|
"hashes": { |
|
"hash_examples": "0d25072e1761652a", |
|
"hash_full_prompts": "f9adc39cfa9f42ba", |
|
"hash_input_tokens": "29a08e9bfbe9b2f0", |
|
"hash_cont_tokens": "0d756ccaae031757" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 110, |
|
"padded": 440, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-security_studies|5": { |
|
"hashes": { |
|
"hash_examples": "62bb8197e63d60d4", |
|
"hash_full_prompts": "869c9c3ae196b7c3", |
|
"hash_input_tokens": "32a03f1f22a6e103", |
|
"hash_cont_tokens": "b2229bc2cfbf594b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 245, |
|
"padded": 980, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-sociology|5": { |
|
"hashes": { |
|
"hash_examples": "e7959df87dea8672", |
|
"hash_full_prompts": "1a1fc00e17b3a52a", |
|
"hash_input_tokens": "1de5c52d2b2831d7", |
|
"hash_cont_tokens": "c3a3bdfd177eed5b" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 201, |
|
"padded": 800, |
|
"non_padded": 4, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-us_foreign_policy|5": { |
|
"hashes": { |
|
"hash_examples": "4a56a01ddca44dca", |
|
"hash_full_prompts": "0c7a7081c71c07b6", |
|
"hash_input_tokens": "add924961f7f4146", |
|
"hash_cont_tokens": "50421e30bef398f9" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 100, |
|
"padded": 400, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-virology|5": { |
|
"hashes": { |
|
"hash_examples": "451cc86a8c4f4fe9", |
|
"hash_full_prompts": "01e95325d8b738e4", |
|
"hash_input_tokens": "e0653601c466b1bc", |
|
"hash_cont_tokens": "af8b3658088cb37f" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 166, |
|
"padded": 664, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|hendrycksTest-world_religions|5": { |
|
"hashes": { |
|
"hash_examples": "3b29cfaf1a81c379", |
|
"hash_full_prompts": "e0d79a15083dfdff", |
|
"hash_input_tokens": "ac600d612445156d", |
|
"hash_cont_tokens": "060118bef6de4e0a" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 171, |
|
"padded": 684, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|truthfulqa:mc|0": { |
|
"hashes": { |
|
"hash_examples": "23176c0531c7b867", |
|
"hash_full_prompts": "36a6d90e75d92d4a", |
|
"hash_input_tokens": "a03ce28b7fd06aa7", |
|
"hash_cont_tokens": "f5da56a132aab151" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 817, |
|
"padded": 9996, |
|
"non_padded": 0, |
|
"effective_few_shots": 0.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|winogrande|5": { |
|
"hashes": { |
|
"hash_examples": "aada0a176fd81218", |
|
"hash_full_prompts": "c8655cbd12de8409", |
|
"hash_input_tokens": "72067255e368e24e", |
|
"hash_cont_tokens": "f08975ad6f2d5864" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 1267, |
|
"padded": 2534, |
|
"non_padded": 0, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
}, |
|
"harness|gsm8k|5": { |
|
"hashes": { |
|
"hash_examples": "4c0843a5d99bcfdc", |
|
"hash_full_prompts": "41d55e83abc0e02d", |
|
"hash_input_tokens": "bda342e47b5099b2", |
|
"hash_cont_tokens": "e4101d08d98273ca" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 1319, |
|
"padded": 0, |
|
"non_padded": 1319, |
|
"effective_few_shots": 5.0, |
|
"num_truncated_few_shots": 0 |
|
} |
|
}, |
|
"summary_general": { |
|
"hashes": { |
|
"hash_examples": "3b7fa57a057f9415", |
|
"hash_full_prompts": "63615fc50fc9417c", |
|
"hash_input_tokens": "a8fa53915153e1db", |
|
"hash_cont_tokens": "c3c012687e8b60d2" |
|
}, |
|
"truncated": 0, |
|
"non_truncated": 28659, |
|
"padded": 113348, |
|
"non_padded": 1524, |
|
"num_truncated_few_shots": 0 |
|
} |
|
} |