Spaces:
Runtime error
Runtime error
Upload scandeval_benchmark_results_mimir_extended_scratch.jsonl
Browse files
scandeval_benchmark_results_mimir_extended_scratch.jsonl
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
{"dataset": "norec", "task": "sentiment-classification", "dataset_languages": ["nb", "nn", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"mcc": 0.38266734646672007, "macro_f1": 0.5913100588384671}, {"mcc": 0.3564318911196257, "macro_f1": 0.5531231690438897}, {"mcc": 0.3589726992629275, "macro_f1": 0.5738696586462754}, {"mcc": 0.32312958617947884, "macro_f1": 0.5082993600392668}, {"mcc": 0.2774396338972279, "macro_f1": 0.5337954104081843}, {"mcc": 0.3041760705547659, "macro_f1": 0.5073430883044556}, {"mcc": 0.3909514790988849, "macro_f1": 0.5897330787984293}, {"mcc": 0.2941285455706591, "macro_f1": 0.5429286070973035}, {"mcc": 0.3546180790178912, "macro_f1": 0.5366149522996762}, {"mcc": 0.3122202180870442, "macro_f1": 0.45841551264589303}]}, "total": {"test_mcc": 33.547355492552256, "test_mcc_se": 2.3927523256655734, "test_macro_f1": 53.95432896121842, "test_macro_f1_se": 2.5479457237781875}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
3 |
+
{"dataset": "norne-nb", "task": "named-entity-recognition", "dataset_languages": ["nb", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"micro_f1_no_misc": 0.0, "micro_f1": 0.0}, {"micro_f1_no_misc": 0.0017849174475680501, "micro_f1": 0.0023734177215189874}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0030097817908201654}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0}, {"micro_f1_no_misc": 0.000794912559618442, "micro_f1": 0.0007155635062611807}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0007393715341959334}, {"micro_f1_no_misc": 0.001601922306768122, "micro_f1": 0.002216475803472479}, {"micro_f1_no_misc": 0.013009540329575022, "micro_f1": 0.016784155756965423}, {"micro_f1_no_misc": 0.0, "micro_f1": 0.0}]}, "total": {"test_micro_f1_no_misc": 0.17191292643529635, "test_micro_f1_no_misc_se": 0.2497050398784199, "test_micro_f1": 0.2583876611323417, "test_micro_f1_se": 0.31712389849800154}}, "num_model_parameters": -1, "max_sequence_length": 2173, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
4 |
+
{"dataset": "scala-nb", "task": "linguistic-acceptability", "dataset_languages": ["nb", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"mcc": 0.03436161719096488, "macro_f1": 0.5159719146717723}, {"mcc": -0.0064039419745103646, "macro_f1": 0.47782617897944013}, {"mcc": 0.007068464828426508, "macro_f1": 0.4339108305348071}, {"mcc": -0.00041692996286816196, "macro_f1": 0.455918995798429}, {"mcc": 0.003915306957592845, "macro_f1": 0.4605041073991091}, {"mcc": -0.022512816557776014, "macro_f1": 0.37442525963795376}, {"mcc": 0.025631389373638038, "macro_f1": 0.47866872478132527}, {"mcc": 0.006700900292016499, "macro_f1": 0.49937853405322713}, {"mcc": 0.016822389234337917, "macro_f1": 0.5057110018502062}, {"mcc": -0.001497194770721912, "macro_f1": 0.3575529761534889}]}, "total": {"test_mcc": 0.6366918461110025, "test_mcc_se": 1.007013849917279, "test_macro_f1": 45.59868523859758, "test_macro_f1_se": 3.318151468525828}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
5 |
+
{"dataset": "norquad", "task": "question-answering", "dataset_languages": ["nb", "nn", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"em": 29.280397022332505, "f1": 48.922874089972105}, {"em": 25.56390977443609, "f1": 44.07250684667331}, {"em": 39.849624060150376, "f1": 59.13358175154649}, {"em": 31.182795698924732, "f1": 54.64236832065543}, {"em": 30.40262941659819, "f1": 48.555759415232664}, {"em": 34.07590759075907, "f1": 52.27156415449808}, {"em": 33.05853256389118, "f1": 53.99846234914738}, {"em": 39.13405495420483, "f1": 58.2698418025881}, {"em": 26.733500417710943, "f1": 45.498253322717396}, {"em": 25.711892797319933, "f1": 42.56467122575274}]}, "total": {"test_em": 31.499324429632782, "test_em_se": 3.165463611121523, "test_f1": 50.79298832787837, "test_f1_se": 3.5912814350285474}}, "num_model_parameters": -1, "max_sequence_length": 2077, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
6 |
+
{"dataset": "no-sammendrag", "task": "summarization", "dataset_languages": ["nb", "nn", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"bertscore": 0.5658930748177227, "rouge_l": 0.1002463127364778}, {"bertscore": 0.5117585628322558, "rouge_l": 0.06016067943273051}, {"bertscore": 0.5433712695521535, "rouge_l": 0.07635404143513833}, {"bertscore": 0.5512683616398135, "rouge_l": 0.0768800615238155}, {"bertscore": 0.5575879507814534, "rouge_l": 0.0904165182829159}, {"bertscore": 0.5839109072403517, "rouge_l": 0.10090912762856939}, {"bertscore": 0.5690900277913897, "rouge_l": 0.09635492621205524}, {"bertscore": 0.586410643576528, "rouge_l": 0.10678216389270284}, {"bertscore": 0.5819870717823505, "rouge_l": 0.10440330933323319}, {"bertscore": 0.5878758266044315, "rouge_l": 0.08490106068392739}]}, "total": {"test_bertscore": 56.39153696618451, "test_bertscore_se": 1.4869058273858868, "test_rouge_l": 8.97408201161566, "test_rouge_l_se": 0.9303311527008332}}, "num_model_parameters": -1, "max_sequence_length": 2301, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
7 |
+
{"dataset": "mmlu-no", "task": "knowledge", "dataset_languages": ["nb", "nn", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"mcc": 0.019538805911349717, "accuracy": 0.25341796875}, {"mcc": 0.01872482878242972, "accuracy": 0.267578125}, {"mcc": 0.0026130506010096323, "accuracy": 0.2451171875}, {"mcc": 0.012387559587292665, "accuracy": 0.255859375}, {"mcc": -0.014028695869900193, "accuracy": 0.24365234375}, {"mcc": 0.018042380038519584, "accuracy": 0.2587890625}, {"mcc": 0.031537703404007106, "accuracy": 0.267578125}, {"mcc": 0.0031934477988807934, "accuracy": 0.24755859375}, {"mcc": 0.007374532755947053, "accuracy": 0.2490234375}, {"mcc": 0.0242791633829266, "accuracy": 0.2646484375}]}, "total": {"test_mcc": 1.2366277639246268, "test_mcc_se": 0.8092987777637096, "test_accuracy": 25.5322265625, "test_accuracy_se": 0.5633854017704093}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
8 |
+
{"dataset": "hellaswag-no", "task": "common-sense-reasoning", "dataset_languages": ["nb", "nn", "no"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"accuracy": 0.23486328125, "mcc": -0.026653830741364553}, {"accuracy": 0.2509765625, "mcc": -0.013879242307161891}, {"accuracy": 0.27685546875, "mcc": 0.0005907763337576108}, {"accuracy": 0.259765625, "mcc": 0.016666306450697478}, {"accuracy": 0.27099609375, "mcc": 0.03648492021235834}, {"accuracy": 0.2548828125, "mcc": -3.347681329678987e-05}, {"accuracy": 0.2431640625, "mcc": -0.028947858997610033}, {"accuracy": 0.2548828125, "mcc": -0.024584846935066084}, {"accuracy": 0.23681640625, "mcc": 0.0037564881846129283}, {"accuracy": 0.24560546875, "mcc": -0.015103225058292518}]}, "total": {"test_accuracy": 25.2880859375, "test_accuracy_se": 0.8498439894972759, "test_mcc": -0.5170398967136549, "test_mcc_se": 1.2951181036732127}}, "num_model_parameters": -1, "max_sequence_length": 2048, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|
9 |
+
{"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "mimir-project/mimir-mistral-7b-extended-scratch", "results": {"raw": {"test": [{"test_speed": 1343.2, "test_speed_short": 164.56}, {"test_speed": 2400.58, "test_speed_short": 296.8}, {"test_speed": 2760.8, "test_speed_short": 559.74}, {"test_speed": 3551.2200000000003, "test_speed_short": 689.49}, {"test_speed": 3900.76, "test_speed_short": 817.04}, {"test_speed": 4157.14, "test_speed_short": 1055.24}, {"test_speed": 4758.96, "test_speed_short": 1215.1200000000001}, {"test_speed": 4887.94, "test_speed_short": 1343.2}, {"test_speed": 4904.4800000000005, "test_speed_short": 1466.52}, {"test_speed": 4771.58, "test_speed_short": 1596.1}]}, "total": {"test_speed": 3743.6659999999997, "test_speed_se": 759.3604986515012, "test_speed_short": 920.3810000000001, "test_speed_short_se": 305.94341604245676}}, "num_model_parameters": -1, "max_sequence_length": 2046, "vocabulary_size": 32768, "generative": true, "few_shot": true, "validation_split": false, "scandeval_version": "12.10.0"}
|