diff --git a/results/cross_lingual/few_shot/cross_logiqa.csv b/results/cross_lingual/few_shot/cross_logiqa.csv new file mode 100644 index 0000000000000000000000000000000000000000..be33b49f018e61bffb67aba31e9dbe3c22c4e3f2 --- /dev/null +++ b/results/cross_lingual/few_shot/cross_logiqa.csv @@ -0,0 +1,2 @@ +Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3-8B,0.44967532467532456,0.2623376623376623,0.33136129711503204,0.5227272727272727,0.4431818181818182,0.44886363636363635,0.44886363636363635,0.3693181818181818,0.4602272727272727,0.45454545454545453 diff --git a/results/cross_lingual/few_shot/cross_mmlu.csv b/results/cross_lingual/few_shot/cross_mmlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..523b1a9f90c8f2daba0c682e477f044e0522d413 --- /dev/null +++ b/results/cross_lingual/few_shot/cross_mmlu.csv @@ -0,0 +1,2 @@ +Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3-8B,0.5295238095238096,0.31923809523809527,0.3983311959862401,0.6266666666666667,0.5466666666666666,0.56,0.4866666666666667,0.5266666666666666,0.5,0.46 diff --git a/results/cross_lingual/few_shot/cross_xquad.csv b/results/cross_lingual/few_shot/cross_xquad.csv new file mode 100644 index 0000000000000000000000000000000000000000..47f23ab1b3c8b57b62758f43972cbaf9c94bdad4 --- /dev/null +++ b/results/cross_lingual/few_shot/cross_xquad.csv @@ -0,0 +1,3 @@ +Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3-70B,0.9596638655462185,0.9359243697478992,0.9476454662047799,0.9697478991596639,0.9504201680672268,0.957983193277311,0.9605042016806723,,, +Meta-Llama-3-8B,0.8928571428571429,0.8163865546218487,0.8529112234365448,0.926890756302521,0.8823529411764706,0.888235294117647,0.8739495798319328,,, diff --git a/results/cross_lingual/zero_shot/cross_logiqa.csv b/results/cross_lingual/zero_shot/cross_logiqa.csv new file mode 100644 index 0000000000000000000000000000000000000000..97abcd73442dc9bfd8da75460aefacdfd6354be5 --- /dev/null +++ b/results/cross_lingual/zero_shot/cross_logiqa.csv @@ -0,0 +1,5 @@ +Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Qwen2-7B-Instruct,0.5673701298701299,0.477922077922078,0.5188189663543613,0.6590909090909091,0.6704545454545454,0.5340909090909091,0.5625,0.5340909090909091,0.5397727272727273,0.4715909090909091 +Qwen2-72B-Instruct,0.6753246753246753,0.6814935064935067,0.6783950674333673,0.75,0.8125,0.6647727272727273,0.6136363636363636,0.6420454545454546,0.6590909090909091,0.5852272727272727 +Meta-Llama-3-8B-Instruct,0.4115259740259741,0.34042207792207796,0.3726122484532397,0.48863636363636365,0.4659090909090909,0.42613636363636365,0.4034090909090909,0.4034090909090909,0.36363636363636365,0.32954545454545453 +Meta-Llama-3-70B-Instruct,0.6290584415584416,0.6181818181818182,0.6235727047409828,0.6988636363636364,0.6875,0.6420454545454546,0.6193181818181818,0.6022727272727273,0.6136363636363636,0.5397727272727273 diff --git a/results/cross_lingual/zero_shot/cross_mmlu.csv b/results/cross_lingual/zero_shot/cross_mmlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..2efe436caa86f564a5ac88181441d7495afaa3ec --- /dev/null +++ b/results/cross_lingual/zero_shot/cross_mmlu.csv @@ -0,0 +1,5 @@ +Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Qwen2-7B-Instruct,0.6495238095238095,0.529714285714286,0.5835327779462245,0.74,0.6733333333333333,0.7,0.6,0.6533333333333333,0.6333333333333333,0.5466666666666666 +Qwen2-72B-Instruct,0.7714285714285715,0.7765714285714286,0.773991456997936,0.8,0.78,0.7866666666666666,0.7333333333333333,0.76,0.78,0.76 +Meta-Llama-3-8B-Instruct,0.5276190476190475,0.3792380952380953,0.4412894449458876,0.62,0.5066666666666667,0.5066666666666667,0.5466666666666666,0.49333333333333335,0.52,0.5 +Meta-Llama-3-70B-Instruct,0.7542857142857143,0.7228571428571428,0.7382370820168919,0.7933333333333333,0.74,0.7666666666666667,0.7466666666666667,0.7666666666666667,0.72,0.7466666666666667 diff --git a/results/cross_lingual/zero_shot/cross_xquad.csv b/results/cross_lingual/zero_shot/cross_xquad.csv new file mode 100644 index 0000000000000000000000000000000000000000..9469a0d301f598d405b7af680360e3b0f4ef85f2 --- /dev/null +++ b/results/cross_lingual/zero_shot/cross_xquad.csv @@ -0,0 +1,5 @@ +Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Qwen2-7B-Instruct,0.940546218487395,0.9016806722689076,0.9207034712119446,0.9521008403361344,0.9352941176470588,0.9445378151260504,0.9302521008403362,,, +Qwen2-72B-Instruct,0.9611344537815126,0.9506302521008403,0.9558534951942531,0.9638655462184874,0.9554621848739496,0.9613445378151261,0.9638655462184874,,, +Meta-Llama-3-8B-Instruct,0.8756302521008403,0.7699579831932772,0.8194012188828194,0.8815126050420168,0.8420168067226891,0.9092436974789916,0.8697478991596639,,, +Meta-Llama-3-70B-Instruct,0.9586134453781513,0.9434873949579832,0.9509902767764395,0.9705882352941176,0.9394957983193277,0.9596638655462185,0.9647058823529412,,, diff --git a/results/cultural_reasoning/few_shot/cn_eval.csv b/results/cultural_reasoning/few_shot/cn_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..7f2af95b8bcc95e96f349a49f75ac662d828e585 --- /dev/null +++ b/results/cultural_reasoning/few_shot/cn_eval.csv @@ -0,0 +1,2 @@ +Model,Accuracy +Meta-Llama-3-8B,0.41904761904761906 diff --git a/results/cultural_reasoning/few_shot/ph_eval.csv b/results/cultural_reasoning/few_shot/ph_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..d9935ad2b8bc6a05dbb2462797fed42c4b22fe35 --- /dev/null +++ b/results/cultural_reasoning/few_shot/ph_eval.csv @@ -0,0 +1,2 @@ +Model,Accuracy +Meta-Llama-3-8B,0.54 diff --git a/results/cultural_reasoning/few_shot/sg_eval.csv b/results/cultural_reasoning/few_shot/sg_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..05e2fd4bcd810817496f13cf6e42900a5154661d --- /dev/null +++ b/results/cultural_reasoning/few_shot/sg_eval.csv @@ -0,0 +1,3 @@ +Model,Accuracy +Meta-Llama-3-70B,0.7572815533980582 +Meta-Llama-3-8B,0.6407766990291263 diff --git a/results/cultural_reasoning/few_shot/us_eval.csv b/results/cultural_reasoning/few_shot/us_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..a464de78937fcb4cab139b625cd1e1ab52450078 --- /dev/null +++ b/results/cultural_reasoning/few_shot/us_eval.csv @@ -0,0 +1,2 @@ +Model,Accuracy +Meta-Llama-3-8B,0.6915887850467289 diff --git a/results/cultural_reasoning/zero_shot/cn_eval.csv b/results/cultural_reasoning/zero_shot/cn_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..23d5ebc3574ee9033938c0436bdf4ccd6db917b1 --- /dev/null +++ b/results/cultural_reasoning/zero_shot/cn_eval.csv @@ -0,0 +1,5 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.8095238095238095 +Qwen2-72B-Instruct,0.8571428571428571 +Meta-Llama-3-8B-Instruct,0.37142857142857144 +Meta-Llama-3-70B-Instruct,0.5142857142857142 diff --git a/results/cultural_reasoning/zero_shot/ph_eval.csv b/results/cultural_reasoning/zero_shot/ph_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..4b64dcef17e18ce8428819ba569460a5df1ebe58 --- /dev/null +++ b/results/cultural_reasoning/zero_shot/ph_eval.csv @@ -0,0 +1,5 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.51 +Qwen2-72B-Instruct,0.63 +Meta-Llama-3-8B-Instruct,0.54 +Meta-Llama-3-70B-Instruct,0.63 diff --git a/results/cultural_reasoning/zero_shot/sg_eval.csv b/results/cultural_reasoning/zero_shot/sg_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..07e4c7a974256a9b60b0dd038ba0e49117ea212a --- /dev/null +++ b/results/cultural_reasoning/zero_shot/sg_eval.csv @@ -0,0 +1,5 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.6699029126213593 +Qwen2-72B-Instruct,0.7378640776699029 +Meta-Llama-3-8B-Instruct,0.5922330097087378 +Meta-Llama-3-70B-Instruct,0.7184466019417476 diff --git a/results/cultural_reasoning/zero_shot/us_eval.csv b/results/cultural_reasoning/zero_shot/us_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..86051361b9c2259b907dd153290f13f52503b1ef --- /dev/null +++ b/results/cultural_reasoning/zero_shot/us_eval.csv @@ -0,0 +1,5 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.719626168224299 +Qwen2-72B-Instruct,0.8504672897196262 +Meta-Llama-3-8B-Instruct,0.6448598130841121 +Meta-Llama-3-70B-Instruct,0.8691588785046729 diff --git a/results/dialogue/few_shot/dialogsum.csv b/results/dialogue/few_shot/dialogsum.csv new file mode 100644 index 0000000000000000000000000000000000000000..432ea7efdaa7a932511e6a0f0b394bfd4ea17963 --- /dev/null +++ b/results/dialogue/few_shot/dialogsum.csv @@ -0,0 +1 @@ +Model,Average,ROUGE-1,ROUGE-2,ROUGE-L diff --git a/results/dialogue/few_shot/dream.csv b/results/dialogue/few_shot/dream.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/dialogue/few_shot/dream.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/dialogue/few_shot/samsum.csv b/results/dialogue/few_shot/samsum.csv new file mode 100644 index 0000000000000000000000000000000000000000..432ea7efdaa7a932511e6a0f0b394bfd4ea17963 --- /dev/null +++ b/results/dialogue/few_shot/samsum.csv @@ -0,0 +1 @@ +Model,Average,ROUGE-1,ROUGE-2,ROUGE-L diff --git a/results/dialogue/zero_shot/dialogsum.csv b/results/dialogue/zero_shot/dialogsum.csv new file mode 100644 index 0000000000000000000000000000000000000000..060f6a61affdc541783817639f2ad160cb26cf1b --- /dev/null +++ b/results/dialogue/zero_shot/dialogsum.csv @@ -0,0 +1,4 @@ +Model,Average,ROUGE-1,ROUGE-2,ROUGE-L +Qwen2-7B-Instruct,0.20907406151501814,0.3054588156947843,0.09317750879187732,0.22858586005839285 +Meta-Llama-3-8B-Instruct,0.23748034560689027,0.33656243928704743,0.11826169056076426,0.2576169069728591 +Meta-Llama-3-70B-Instruct,0.2557065499979308,0.36058417323628,0.12758087337786866,0.2789546033796438 diff --git a/results/dialogue/zero_shot/dream.csv b/results/dialogue/zero_shot/dream.csv new file mode 100644 index 0000000000000000000000000000000000000000..c12fed5b208f662dcd60e50cc34d6ebaacca2157 --- /dev/null +++ b/results/dialogue/zero_shot/dream.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.9338559529642332 +Meta-Llama-3-8B-Instruct,0.5433610975012249 +Meta-Llama-3-70B-Instruct,0.9480646741793238 diff --git a/results/dialogue/zero_shot/samsum.csv b/results/dialogue/zero_shot/samsum.csv new file mode 100644 index 0000000000000000000000000000000000000000..c6f8f401d1e68d13ca64d56281202573a0378bc4 --- /dev/null +++ b/results/dialogue/zero_shot/samsum.csv @@ -0,0 +1,4 @@ +Model,Average,ROUGE-1,ROUGE-2,ROUGE-L +Qwen2-7B-Instruct,0.2609036529701212,0.36802926348230236,0.1319027531874975,0.28277894224056366 +Meta-Llama-3-8B-Instruct,0.2850232460296334,0.3945214081577773,0.15619034353394273,0.3043579863971803 +Meta-Llama-3-70B-Instruct,0.2893525314227379,0.4030746211134018,0.15236139065578,0.3126215824990321 diff --git a/results/emotion/few_shot/ind_emotion.csv b/results/emotion/few_shot/ind_emotion.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/emotion/few_shot/ind_emotion.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/emotion/few_shot/sst2.csv b/results/emotion/few_shot/sst2.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/emotion/few_shot/sst2.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/emotion/zero_shot/ind_emotion.csv b/results/emotion/zero_shot/ind_emotion.csv new file mode 100644 index 0000000000000000000000000000000000000000..704023cc83184a72a07a429b3cc789bf30c96764 --- /dev/null +++ b/results/emotion/zero_shot/ind_emotion.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.6386363636363637 +Meta-Llama-3-8B-Instruct,0.6522727272727272 +Meta-Llama-3-70B-Instruct,0.6909090909090909 diff --git a/results/emotion/zero_shot/sst2.csv b/results/emotion/zero_shot/sst2.csv new file mode 100644 index 0000000000000000000000000000000000000000..5c9a67d7e01c440fb9f21fe79282683a2b4c7ab0 --- /dev/null +++ b/results/emotion/zero_shot/sst2.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.9231651376146789 +Meta-Llama-3-8B-Instruct,0.8669724770642202 +Meta-Llama-3-70B-Instruct,0.9495412844036697 diff --git a/results/flores_translation/few_shot/ind2eng.csv b/results/flores_translation/few_shot/ind2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..7f534e8cdc32663e57e36872fe0572ac0a3a2411 --- /dev/null +++ b/results/flores_translation/few_shot/ind2eng.csv @@ -0,0 +1,2 @@ +Model,BLEU +Meta-Llama-3-8B,0.37760317005449096 diff --git a/results/flores_translation/few_shot/vie2eng.csv b/results/flores_translation/few_shot/vie2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..56764548e5126ccb0bc1ae02ee1ec8aecd0546ac --- /dev/null +++ b/results/flores_translation/few_shot/vie2eng.csv @@ -0,0 +1,2 @@ +Model,BLEU +Meta-Llama-3-8B,0.31157996445764863 diff --git a/results/flores_translation/few_shot/zho2eng.csv b/results/flores_translation/few_shot/zho2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..ca98b0289c01e5d0e609d239b20ed788216c429f --- /dev/null +++ b/results/flores_translation/few_shot/zho2eng.csv @@ -0,0 +1,2 @@ +Model,BLEU +Meta-Llama-3-8B,0.23710858530408072 diff --git a/results/flores_translation/few_shot/zsm2eng.csv b/results/flores_translation/few_shot/zsm2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..b560efa78538383d03231ff2577b702a9eb9c3f8 --- /dev/null +++ b/results/flores_translation/few_shot/zsm2eng.csv @@ -0,0 +1,2 @@ +Model,BLEU +Meta-Llama-3-8B,0.3908770132718593 diff --git a/results/flores_translation/zero_shot/ind2eng.csv b/results/flores_translation/zero_shot/ind2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..de375b52ab4c145693acec0d7ed14d1b219facd0 --- /dev/null +++ b/results/flores_translation/zero_shot/ind2eng.csv @@ -0,0 +1,4 @@ +Model,BLEU +Qwen2-7B-Instruct,0.2968667083646938 +Meta-Llama-3-8B-Instruct,0.33011728860318257 +Meta-Llama-3-70B-Instruct,0.3830092775167675 diff --git a/results/flores_translation/zero_shot/vie2eng.csv b/results/flores_translation/zero_shot/vie2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..ffc10e5cbab87e1ffe9a28ead5340d2249b5a667 --- /dev/null +++ b/results/flores_translation/zero_shot/vie2eng.csv @@ -0,0 +1,4 @@ +Model,BLEU +Qwen2-7B-Instruct,0.23571859325121644 +Meta-Llama-3-8B-Instruct,0.2637063711923046 +Meta-Llama-3-70B-Instruct,0.3230140263371192 diff --git a/results/flores_translation/zero_shot/zho2eng.csv b/results/flores_translation/zero_shot/zho2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..8e2fe04f143cbbaad6c5212a6dc3097fc3f39739 --- /dev/null +++ b/results/flores_translation/zero_shot/zho2eng.csv @@ -0,0 +1,4 @@ +Model,BLEU +Qwen2-7B-Instruct,0.21747115262398484 +Meta-Llama-3-8B-Instruct,0.19960072119079214 +Meta-Llama-3-70B-Instruct,0.24397819518058994 diff --git a/results/flores_translation/zero_shot/zsm2eng.csv b/results/flores_translation/zero_shot/zsm2eng.csv new file mode 100644 index 0000000000000000000000000000000000000000..d8e006fdb7ae46f7cfd3aa8a1f8cf8264589703b --- /dev/null +++ b/results/flores_translation/zero_shot/zsm2eng.csv @@ -0,0 +1,4 @@ +Model,BLEU +Qwen2-7B-Instruct,0.27198336767927184 +Meta-Llama-3-8B-Instruct,0.31536374302282033 +Meta-Llama-3-70B-Instruct,0.3957287030176054 diff --git a/results/fundamental_nlp_tasks/few_shot/c3.csv b/results/fundamental_nlp_tasks/few_shot/c3.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/c3.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/cola.csv b/results/fundamental_nlp_tasks/few_shot/cola.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/cola.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/mnli.csv b/results/fundamental_nlp_tasks/few_shot/mnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/mnli.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/mrpc.csv b/results/fundamental_nlp_tasks/few_shot/mrpc.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/mrpc.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/ocnli.csv b/results/fundamental_nlp_tasks/few_shot/ocnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/ocnli.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/qnli.csv b/results/fundamental_nlp_tasks/few_shot/qnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/qnli.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/qqp.csv b/results/fundamental_nlp_tasks/few_shot/qqp.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/qqp.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/rte.csv b/results/fundamental_nlp_tasks/few_shot/rte.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/rte.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/few_shot/wnli.csv b/results/fundamental_nlp_tasks/few_shot/wnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/fundamental_nlp_tasks/few_shot/wnli.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/fundamental_nlp_tasks/zero_shot/c3.csv b/results/fundamental_nlp_tasks/zero_shot/c3.csv new file mode 100644 index 0000000000000000000000000000000000000000..be600fb4de242c35c44421e82f442425f3d40d9a --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/c3.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.9233358264771877 +Meta-Llama-3-8B-Instruct,0.8515332834704562 +Meta-Llama-3-70B-Instruct,0.9521316379955124 diff --git a/results/fundamental_nlp_tasks/zero_shot/cola.csv b/results/fundamental_nlp_tasks/zero_shot/cola.csv new file mode 100644 index 0000000000000000000000000000000000000000..093f0edfd88f87334431c3f1a063ada79b2c3f9f --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/cola.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.7861936720997124 +Meta-Llama-3-8B-Instruct,0.6481303930968361 +Meta-Llama-3-70B-Instruct,0.835091083413231 diff --git a/results/fundamental_nlp_tasks/zero_shot/mnli.csv b/results/fundamental_nlp_tasks/zero_shot/mnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..55c4a1d8153d5674ee360384e5a623ff02b8ad5e --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/mnli.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.7341578867002596 +Meta-Llama-3-8B-Instruct,0.5296991907161399 +Meta-Llama-3-70B-Instruct,0.6709421285692472 diff --git a/results/fundamental_nlp_tasks/zero_shot/mrpc.csv b/results/fundamental_nlp_tasks/zero_shot/mrpc.csv new file mode 100644 index 0000000000000000000000000000000000000000..bf561efd8a4b1d91de2f4b4fce3bea9f3bbc422f --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/mrpc.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.7745098039215687 +Meta-Llama-3-8B-Instruct,0.6764705882352942 +Meta-Llama-3-70B-Instruct,0.7598039215686274 diff --git a/results/fundamental_nlp_tasks/zero_shot/ocnli.csv b/results/fundamental_nlp_tasks/zero_shot/ocnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..faa8c4ea976d028167f54f1c6f80f6733c9526a9 --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/ocnli.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.6474576271186441 +Meta-Llama-3-8B-Instruct,0.4322033898305085 +Meta-Llama-3-70B-Instruct,0.5928813559322034 diff --git a/results/fundamental_nlp_tasks/zero_shot/qnli.csv b/results/fundamental_nlp_tasks/zero_shot/qnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..b7dfb5988db2ba12269801a8790fea6432973961 --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/qnli.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.8169503935566539 +Meta-Llama-3-8B-Instruct,0.5689181768259198 +Meta-Llama-3-70B-Instruct,0.876807614863628 diff --git a/results/fundamental_nlp_tasks/zero_shot/qqp.csv b/results/fundamental_nlp_tasks/zero_shot/qqp.csv new file mode 100644 index 0000000000000000000000000000000000000000..bbf0ac307ff93e929802c21e18e1ae13a08c2f36 --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/qqp.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.7771209497897601 +Meta-Llama-3-8B-Instruct,0.5512490724709375 +Meta-Llama-3-70B-Instruct,0.7876082117239673 diff --git a/results/fundamental_nlp_tasks/zero_shot/rte.csv b/results/fundamental_nlp_tasks/zero_shot/rte.csv new file mode 100644 index 0000000000000000000000000000000000000000..b247ab7191c17cca09857286e2712222ee17e653 --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/rte.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.8411552346570397 +Meta-Llama-3-8B-Instruct,0.6028880866425993 +Meta-Llama-3-70B-Instruct,0.8086642599277978 diff --git a/results/fundamental_nlp_tasks/zero_shot/wnli.csv b/results/fundamental_nlp_tasks/zero_shot/wnli.csv new file mode 100644 index 0000000000000000000000000000000000000000..879b9f67d19fd37dc515629a8e7e47d8d750742d --- /dev/null +++ b/results/fundamental_nlp_tasks/zero_shot/wnli.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.647887323943662 +Meta-Llama-3-8B-Instruct,0.4507042253521127 +Meta-Llama-3-70B-Instruct,0.7887323943661971 diff --git a/results/general_reasoning/few_shot/c_eval.csv b/results/general_reasoning/few_shot/c_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/general_reasoning/few_shot/c_eval.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/general_reasoning/few_shot/cmmlu.csv b/results/general_reasoning/few_shot/cmmlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/general_reasoning/few_shot/cmmlu.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/general_reasoning/few_shot/indommlu.csv b/results/general_reasoning/few_shot/indommlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/general_reasoning/few_shot/indommlu.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/general_reasoning/few_shot/mmlu.csv b/results/general_reasoning/few_shot/mmlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/general_reasoning/few_shot/mmlu.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/general_reasoning/few_shot/zbench.csv b/results/general_reasoning/few_shot/zbench.csv new file mode 100644 index 0000000000000000000000000000000000000000..1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab --- /dev/null +++ b/results/general_reasoning/few_shot/zbench.csv @@ -0,0 +1 @@ +Model,Accuracy diff --git a/results/general_reasoning/zero_shot/c_eval.csv b/results/general_reasoning/zero_shot/c_eval.csv new file mode 100644 index 0000000000000000000000000000000000000000..d08930fb8dbd3349041f76ccde23044c8304b3c6 --- /dev/null +++ b/results/general_reasoning/zero_shot/c_eval.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.7546699875466999 +Meta-Llama-3-8B-Instruct,0.4533001245330012 +Meta-Llama-3-70B-Instruct,0.6046077210460772 diff --git a/results/general_reasoning/zero_shot/cmmlu.csv b/results/general_reasoning/zero_shot/cmmlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..afbe25d4d83259f4b2b002e37673ed21711abeeb --- /dev/null +++ b/results/general_reasoning/zero_shot/cmmlu.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.7656708685891901 +Meta-Llama-3-8B-Instruct,0.4679675358314626 +Meta-Llama-3-70B-Instruct,0.6195821101709549 diff --git a/results/general_reasoning/zero_shot/indommlu.csv b/results/general_reasoning/zero_shot/indommlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..2b9d296121c9db1fd5c9d64a045dd639e6a471de --- /dev/null +++ b/results/general_reasoning/zero_shot/indommlu.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.53027571934041 +Meta-Llama-3-8B-Instruct,0.5115161225716003 +Meta-Llama-3-70B-Instruct,0.6323519594098405 diff --git a/results/general_reasoning/zero_shot/mmlu.csv b/results/general_reasoning/zero_shot/mmlu.csv new file mode 100644 index 0000000000000000000000000000000000000000..c9624f74c0176cad41871491016275e5ba2035e4 --- /dev/null +++ b/results/general_reasoning/zero_shot/mmlu.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.6654272434751519 +Meta-Llama-3-8B-Instruct,0.508044333214158 +Meta-Llama-3-70B-Instruct,0.7607436539149088 diff --git a/results/general_reasoning/zero_shot/zbench.csv b/results/general_reasoning/zero_shot/zbench.csv new file mode 100644 index 0000000000000000000000000000000000000000..1e0718ab1a40555aea72e40648613d13d9ce1360 --- /dev/null +++ b/results/general_reasoning/zero_shot/zbench.csv @@ -0,0 +1,4 @@ +Model,Accuracy +Qwen2-7B-Instruct,0.696969696969697 +Meta-Llama-3-8B-Instruct,0.30303030303030304 +Meta-Llama-3-70B-Instruct,0.45454545454545453