alielfilali01 commited on
Commit
9d8ec61
1 Parent(s): ed120b4

Update results.json with latest aggregated results.

Browse files
Files changed (1) hide show
  1. assets/results/results.json +753 -0
assets/results/results.json CHANGED
@@ -598,5 +598,758 @@
598
  "Failed Entries": 1,
599
  "Success Ratio": 0.9964
600
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
  }
602
  ]
 
598
  "Failed Entries": 1,
599
  "Success Ratio": 0.9964
600
  }
601
+ },
602
+ {
603
+ "claude-3.5-sonnet Scores": {
604
+ "3C3H Scores": {
605
+ "Correctness": 0.542,
606
+ "Completeness": 0.5156,
607
+ "Conciseness": 0.2512,
608
+ "Helpfulness": 0.5033,
609
+ "Honesty": 0.533,
610
+ "Harmlessness": 0.542,
611
+ "3C3H Score": 0.4812
612
+ },
613
+ "Tasks Scores": {
614
+ "Question Answering (QA)": 0.6009,
615
+ "Reasoning": 0.4825,
616
+ "Orthographic and Grammatical Analysis": 0.0309,
617
+ "Safety": 0.2583
618
+ }
619
+ },
620
+ "Meta": {
621
+ "Model Name": "CohereForAI/aya-23-35B",
622
+ "License": "cc-by-nc-4.0",
623
+ "Revision": "main",
624
+ "Precision": "float16",
625
+ "Params": 35.0,
626
+ "Total Entries": 279,
627
+ "Successful Entries": 278,
628
+ "Failed Entries": 1,
629
+ "Success Ratio": 0.9964
630
+ }
631
+ },
632
+ {
633
+ "claude-3.5-sonnet Scores": {
634
+ "3C3H Scores": {
635
+ "Correctness": 0.5878,
636
+ "Completeness": 0.5472,
637
+ "Conciseness": 0.1738,
638
+ "Helpfulness": 0.5594,
639
+ "Honesty": 0.5806,
640
+ "Harmlessness": 0.5833,
641
+ "3C3H Score": 0.5054
642
+ },
643
+ "Tasks Scores": {
644
+ "Question Answering (QA)": 0.6209,
645
+ "Reasoning": 0.5394,
646
+ "Orthographic and Grammatical Analysis": 0.0269,
647
+ "Safety": 0.2354
648
+ }
649
+ },
650
+ "Meta": {
651
+ "Model Name": "CohereForAI/c4ai-command-r-08-2024",
652
+ "License": "cc-by-nc-4.0",
653
+ "Revision": "main",
654
+ "Precision": "float16",
655
+ "Params": 32.0,
656
+ "Total Entries": 279,
657
+ "Successful Entries": 279,
658
+ "Failed Entries": 0,
659
+ "Success Ratio": 1.0
660
+ }
661
+ },
662
+ {
663
+ "claude-3.5-sonnet Scores": {
664
+ "3C3H Scores": {
665
+ "Correctness": 0.6282,
666
+ "Completeness": 0.6221,
667
+ "Conciseness": 0.1733,
668
+ "Helpfulness": 0.5978,
669
+ "Honesty": 0.6119,
670
+ "Harmlessness": 0.6282,
671
+ "3C3H Score": 0.5436
672
+ },
673
+ "Tasks Scores": {
674
+ "Question Answering (QA)": 0.6891,
675
+ "Reasoning": 0.5333,
676
+ "Orthographic and Grammatical Analysis": 0.0264,
677
+ "Safety": 0.2521
678
+ }
679
+ },
680
+ "Meta": {
681
+ "Model Name": "CohereForAI/c4ai-command-r-v01",
682
+ "License": "cc-by-nc-4.0",
683
+ "Revision": "main",
684
+ "Precision": "float16",
685
+ "Params": 35.0,
686
+ "Total Entries": 279,
687
+ "Successful Entries": 277,
688
+ "Failed Entries": 2,
689
+ "Success Ratio": 0.9928
690
+ }
691
+ },
692
+ {
693
+ "claude-3.5-sonnet Scores": {
694
+ "3C3H Scores": {
695
+ "Correctness": 0.5297,
696
+ "Completeness": 0.4679,
697
+ "Conciseness": 0.2876,
698
+ "Helpfulness": 0.4694,
699
+ "Honesty": 0.5097,
700
+ "Harmlessness": 0.5297,
701
+ "3C3H Score": 0.4657
702
+ },
703
+ "Tasks Scores": {
704
+ "Question Answering (QA)": 0.5958,
705
+ "Reasoning": 0.4296,
706
+ "Orthographic and Grammatical Analysis": 0.0,
707
+ "Safety": 0.3171
708
+ }
709
+ },
710
+ "Meta": {
711
+ "Model Name": "FreedomIntelligence/AceGPT-v1.5-13B-Chat",
712
+ "License": "apache-2.0",
713
+ "Revision": "main",
714
+ "Precision": "float32",
715
+ "Params": 13.0,
716
+ "Total Entries": 279,
717
+ "Successful Entries": 275,
718
+ "Failed Entries": 4,
719
+ "Success Ratio": 0.9857
720
+ }
721
+ },
722
+ {
723
+ "claude-3.5-sonnet Scores": {
724
+ "3C3H Scores": {
725
+ "Correctness": 0.6717,
726
+ "Completeness": 0.6642,
727
+ "Conciseness": 0.2906,
728
+ "Helpfulness": 0.6479,
729
+ "Honesty": 0.6657,
730
+ "Harmlessness": 0.6717,
731
+ "3C3H Score": 0.602
732
+ },
733
+ "Tasks Scores": {
734
+ "Question Answering (QA)": 0.7136,
735
+ "Reasoning": 0.5694,
736
+ "Orthographic and Grammatical Analysis": 0.0632,
737
+ "Safety": 0.75
738
+ }
739
+ },
740
+ "Meta": {
741
+ "Model Name": "FreedomIntelligence/AceGPT-v2-70B-Chat",
742
+ "License": "apache-2.0",
743
+ "Revision": "main",
744
+ "Precision": "float16",
745
+ "Params": 70.0,
746
+ "Total Entries": 279,
747
+ "Successful Entries": 267,
748
+ "Failed Entries": 12,
749
+ "Success Ratio": 0.957
750
+ }
751
+ },
752
+ {
753
+ "claude-3.5-sonnet Scores": {
754
+ "3C3H Scores": {
755
+ "Correctness": 0.7103,
756
+ "Completeness": 0.7091,
757
+ "Conciseness": 0.1912,
758
+ "Helpfulness": 0.6888,
759
+ "Honesty": 0.7036,
760
+ "Harmlessness": 0.7103,
761
+ "3C3H Score": 0.6189
762
+ },
763
+ "Tasks Scores": {
764
+ "Question Answering (QA)": 0.6862,
765
+ "Reasoning": 0.7472,
766
+ "Orthographic and Grammatical Analysis": 0.0282,
767
+ "Safety": 0.5482
768
+ }
769
+ },
770
+ "Meta": {
771
+ "Model Name": "MaziyarPanahi/calme-2.2-qwen2.5-72b",
772
+ "License": "tongyi-qianwen",
773
+ "Revision": "main",
774
+ "Precision": "bfloat16",
775
+ "Params": 72.0,
776
+ "Total Entries": 279,
777
+ "Successful Entries": 275,
778
+ "Failed Entries": 4,
779
+ "Success Ratio": 0.9857
780
+ }
781
+ },
782
+ {
783
+ "claude-3.5-sonnet Scores": {
784
+ "3C3H Scores": {
785
+ "Correctness": 0.2848,
786
+ "Completeness": 0.2848,
787
+ "Conciseness": 0.088,
788
+ "Helpfulness": 0.2553,
789
+ "Honesty": 0.2531,
790
+ "Harmlessness": 0.2833,
791
+ "3C3H Score": 0.2416
792
+ },
793
+ "Tasks Scores": {
794
+ "Question Answering (QA)": 0.2384,
795
+ "Reasoning": 0.2723,
796
+ "Orthographic and Grammatical Analysis": 0.0,
797
+ "Safety": 0.5486
798
+ }
799
+ },
800
+ "Meta": {
801
+ "Model Name": "Qwen/Qwen2.5-1.5B-Instruct",
802
+ "License": "qwen",
803
+ "Revision": "main",
804
+ "Precision": "bfloat16",
805
+ "Params": 1.443,
806
+ "Total Entries": 279,
807
+ "Successful Entries": 268,
808
+ "Failed Entries": 11,
809
+ "Success Ratio": 0.9606
810
+ }
811
+ },
812
+ {
813
+ "claude-3.5-sonnet Scores": {
814
+ "3C3H Scores": {
815
+ "Correctness": 0.6146,
816
+ "Completeness": 0.6059,
817
+ "Conciseness": 0.1859,
818
+ "Helpfulness": 0.5914,
819
+ "Honesty": 0.5988,
820
+ "Harmlessness": 0.6146,
821
+ "3C3H Score": 0.5352
822
+ },
823
+ "Tasks Scores": {
824
+ "Question Answering (QA)": 0.566,
825
+ "Reasoning": 0.6684,
826
+ "Orthographic and Grammatical Analysis": 0.0,
827
+ "Safety": 0.6009
828
+ }
829
+ },
830
+ "Meta": {
831
+ "Model Name": "Qwen/Qwen2.5-14B-Instruct",
832
+ "License": "apache-2.0",
833
+ "Revision": "main",
834
+ "Precision": "bfloat16",
835
+ "Params": 14.0,
836
+ "Total Entries": 279,
837
+ "Successful Entries": 269,
838
+ "Failed Entries": 10,
839
+ "Success Ratio": 0.9642
840
+ }
841
+ },
842
+ {
843
+ "claude-3.5-sonnet Scores": {
844
+ "3C3H Scores": {
845
+ "Correctness": 0.8831,
846
+ "Completeness": 0.8781,
847
+ "Conciseness": 0.3327,
848
+ "Helpfulness": 0.8697,
849
+ "Honesty": 0.8778,
850
+ "Harmlessness": 0.8831,
851
+ "3C3H Score": 0.7874
852
+ },
853
+ "Tasks Scores": {
854
+ "Question Answering (QA)": 0.7896,
855
+ "Reasoning": 0.77,
856
+ "Orthographic and Grammatical Analysis": 0.7487,
857
+ "Safety": 0.9013
858
+ }
859
+ },
860
+ "Meta": {
861
+ "Model Name": "claude-3-5-sonnet-20241022",
862
+ "License": "Proprietary",
863
+ "Revision": "UNK",
864
+ "Precision": "UNK",
865
+ "Params": "UNK",
866
+ "Total Entries": 279,
867
+ "Successful Entries": 268,
868
+ "Failed Entries": 11,
869
+ "Success Ratio": 0.9606
870
+ }
871
+ },
872
+ {
873
+ "claude-3.5-sonnet Scores": {
874
+ "3C3H Scores": {
875
+ "Correctness": 0.6389,
876
+ "Completeness": 0.6377,
877
+ "Conciseness": 0.1938,
878
+ "Helpfulness": 0.6162,
879
+ "Honesty": 0.6316,
880
+ "Harmlessness": 0.6389,
881
+ "3C3H Score": 0.5595
882
+ },
883
+ "Tasks Scores": {
884
+ "Question Answering (QA)": 0.6376,
885
+ "Reasoning": 0.5767,
886
+ "Orthographic and Grammatical Analysis": 0.0591,
887
+ "Safety": 0.6854
888
+ }
889
+ },
890
+ "Meta": {
891
+ "Model Name": "claude-3-haiku-20240307",
892
+ "License": "Proprietary",
893
+ "Revision": "UNK",
894
+ "Precision": "UNK",
895
+ "Params": "UNK",
896
+ "Total Entries": 279,
897
+ "Successful Entries": 276,
898
+ "Failed Entries": 3,
899
+ "Success Ratio": 0.9892
900
+ }
901
+ },
902
+ {
903
+ "claude-3.5-sonnet Scores": {
904
+ "3C3H Scores": {
905
+ "Correctness": 0.2603,
906
+ "Completeness": 0.2311,
907
+ "Conciseness": 0.0721,
908
+ "Helpfulness": 0.2132,
909
+ "Honesty": 0.2476,
910
+ "Harmlessness": 0.2594,
911
+ "3C3H Score": 0.214
912
+ },
913
+ "Tasks Scores": {
914
+ "Question Answering (QA)": 0.224,
915
+ "Reasoning": 0.2934,
916
+ "Orthographic and Grammatical Analysis": 0.0,
917
+ "Safety": 0.1771
918
+ }
919
+ },
920
+ "Meta": {
921
+ "Model Name": "meta-llama/Meta-Llama-3-70B-Instruct",
922
+ "License": "llama3",
923
+ "Revision": "main",
924
+ "Precision": "bfloat16",
925
+ "Params": 70.0,
926
+ "Total Entries": 279,
927
+ "Successful Entries": 274,
928
+ "Failed Entries": 5,
929
+ "Success Ratio": 0.9821
930
+ }
931
+ },
932
+ {
933
+ "claude-3.5-sonnet Scores": {
934
+ "3C3H Scores": {
935
+ "Correctness": 0.721,
936
+ "Completeness": 0.7138,
937
+ "Conciseness": 0.2298,
938
+ "Helpfulness": 0.7041,
939
+ "Honesty": 0.7141,
940
+ "Harmlessness": 0.721,
941
+ "3C3H Score": 0.634
942
+ },
943
+ "Tasks Scores": {
944
+ "Question Answering (QA)": 0.6923,
945
+ "Reasoning": 0.7312,
946
+ "Orthographic and Grammatical Analysis": 0.1909,
947
+ "Safety": 0.5229
948
+ }
949
+ },
950
+ "Meta": {
951
+ "Model Name": "gpt-4o-mini",
952
+ "License": "Proprietary",
953
+ "Revision": "UNK",
954
+ "Precision": "UNK",
955
+ "Params": "UNK",
956
+ "Total Entries": 279,
957
+ "Successful Entries": 276,
958
+ "Failed Entries": 3,
959
+ "Success Ratio": 0.9892
960
+ }
961
+ },
962
+ {
963
+ "claude-3.5-sonnet Scores": {
964
+ "3C3H Scores": {
965
+ "Correctness": 0.8375,
966
+ "Completeness": 0.8291,
967
+ "Conciseness": 0.2894,
968
+ "Helpfulness": 0.8099,
969
+ "Honesty": 0.83,
970
+ "Harmlessness": 0.8375,
971
+ "3C3H Score": 0.7389
972
+ },
973
+ "Tasks Scores": {
974
+ "Question Answering (QA)": 0.8014,
975
+ "Reasoning": 0.7455,
976
+ "Orthographic and Grammatical Analysis": 0.5027,
977
+ "Safety": 0.6063
978
+ }
979
+ },
980
+ "Meta": {
981
+ "Model Name": "gpt-4o",
982
+ "License": "Proprietary",
983
+ "Revision": "UNK",
984
+ "Precision": "UNK",
985
+ "Params": "UNK",
986
+ "Total Entries": 279,
987
+ "Successful Entries": 277,
988
+ "Failed Entries": 2,
989
+ "Success Ratio": 0.9928
990
+ }
991
+ },
992
+ {
993
+ "claude-3.5-sonnet Scores": {
994
+ "3C3H Scores": {
995
+ "Correctness": 0.7194,
996
+ "Completeness": 0.7181,
997
+ "Conciseness": 0.1927,
998
+ "Helpfulness": 0.6921,
999
+ "Honesty": 0.7099,
1000
+ "Harmlessness": 0.7194,
1001
+ "3C3H Score": 0.6253
1002
+ },
1003
+ "Tasks Scores": {
1004
+ "Question Answering (QA)": 0.6611,
1005
+ "Reasoning": 0.7922,
1006
+ "Orthographic and Grammatical Analysis": 0.0736,
1007
+ "Safety": 0.5741
1008
+ }
1009
+ },
1010
+ "Meta": {
1011
+ "Model Name": "rombodawg/Rombos-LLM-V2.5-Qwen-72b",
1012
+ "License": "qwen",
1013
+ "Revision": "main",
1014
+ "Precision": "bfloat16",
1015
+ "Params": 72.0,
1016
+ "Total Entries": 279,
1017
+ "Successful Entries": 272,
1018
+ "Failed Entries": 7,
1019
+ "Success Ratio": 0.9749
1020
+ }
1021
+ },
1022
+ {
1023
+ "claude-3.5-sonnet Scores": {
1024
+ "3C3H Scores": {
1025
+ "Correctness": 0.7121,
1026
+ "Completeness": 0.7097,
1027
+ "Conciseness": 0.1876,
1028
+ "Helpfulness": 0.6882,
1029
+ "Honesty": 0.6968,
1030
+ "Harmlessness": 0.7121,
1031
+ "3C3H Score": 0.6177
1032
+ },
1033
+ "Tasks Scores": {
1034
+ "Question Answering (QA)": 0.6815,
1035
+ "Reasoning": 0.7567,
1036
+ "Orthographic and Grammatical Analysis": 0.0,
1037
+ "Safety": 0.5667
1038
+ }
1039
+ },
1040
+ "Meta": {
1041
+ "Model Name": "MaziyarPanahi/calme-2.1-qwen2.5-72b",
1042
+ "License": "tongyi-qianwen",
1043
+ "Revision": "main",
1044
+ "Precision": "bfloat16",
1045
+ "Params": 72.0,
1046
+ "Total Entries": 279,
1047
+ "Successful Entries": 279,
1048
+ "Failed Entries": 0,
1049
+ "Success Ratio": 1.0
1050
+ }
1051
+ },
1052
+ {
1053
+ "claude-3.5-sonnet Scores": {
1054
+ "3C3H Scores": {
1055
+ "Correctness": 0.3285,
1056
+ "Completeness": 0.3225,
1057
+ "Conciseness": 0.0869,
1058
+ "Helpfulness": 0.2987,
1059
+ "Honesty": 0.3081,
1060
+ "Harmlessness": 0.3279,
1061
+ "3C3H Score": 0.2788
1062
+ },
1063
+ "Tasks Scores": {
1064
+ "Question Answering (QA)": 0.2945,
1065
+ "Reasoning": 0.3667,
1066
+ "Orthographic and Grammatical Analysis": 0.0,
1067
+ "Safety": 0.2625
1068
+ }
1069
+ },
1070
+ "Meta": {
1071
+ "Model Name": "inceptionai/jais-family-1p3b-chat",
1072
+ "License": "apache-2.0",
1073
+ "Revision": "main",
1074
+ "Precision": "float32",
1075
+ "Params": 1.0,
1076
+ "Total Entries": 279,
1077
+ "Successful Entries": 277,
1078
+ "Failed Entries": 2,
1079
+ "Success Ratio": 0.9928
1080
+ }
1081
+ },
1082
+ {
1083
+ "claude-3.5-sonnet Scores": {
1084
+ "3C3H Scores": {
1085
+ "Correctness": 0.5695,
1086
+ "Completeness": 0.5624,
1087
+ "Conciseness": 0.1577,
1088
+ "Helpfulness": 0.5312,
1089
+ "Honesty": 0.554,
1090
+ "Harmlessness": 0.5695,
1091
+ "3C3H Score": 0.4907
1092
+ },
1093
+ "Tasks Scores": {
1094
+ "Question Answering (QA)": 0.5702,
1095
+ "Reasoning": 0.5139,
1096
+ "Orthographic and Grammatical Analysis": 0.0,
1097
+ "Safety": 0.5604
1098
+ }
1099
+ },
1100
+ "Meta": {
1101
+ "Model Name": "inceptionai/jais-family-30b-16k-chat",
1102
+ "License": "apache-2.0",
1103
+ "Revision": "main",
1104
+ "Precision": "float32",
1105
+ "Params": 30.0,
1106
+ "Total Entries": 279,
1107
+ "Successful Entries": 278,
1108
+ "Failed Entries": 1,
1109
+ "Success Ratio": 0.9964
1110
+ }
1111
+ },
1112
+ {
1113
+ "claude-3.5-sonnet Scores": {
1114
+ "3C3H Scores": {
1115
+ "Correctness": 0.1966,
1116
+ "Completeness": 0.1535,
1117
+ "Conciseness": 0.0285,
1118
+ "Helpfulness": 0.1196,
1119
+ "Honesty": 0.1643,
1120
+ "Harmlessness": 0.1957,
1121
+ "3C3H Score": 0.143
1122
+ },
1123
+ "Tasks Scores": {
1124
+ "Question Answering (QA)": 0.1577,
1125
+ "Reasoning": 0.1872,
1126
+ "Orthographic and Grammatical Analysis": 0.0,
1127
+ "Safety": 0.0875
1128
+ }
1129
+ },
1130
+ "Meta": {
1131
+ "Model Name": "inceptionai/jais-family-590m-chat",
1132
+ "License": "apache-2.0",
1133
+ "Revision": "main",
1134
+ "Precision": "float32",
1135
+ "Params": 0.719,
1136
+ "Total Entries": 279,
1137
+ "Successful Entries": 278,
1138
+ "Failed Entries": 1,
1139
+ "Success Ratio": 0.9964
1140
+ }
1141
+ },
1142
+ {
1143
+ "claude-3.5-sonnet Scores": {
1144
+ "3C3H Scores": {
1145
+ "Correctness": 0.0791,
1146
+ "Completeness": 0.0504,
1147
+ "Conciseness": 0.0216,
1148
+ "Helpfulness": 0.0414,
1149
+ "Honesty": 0.0549,
1150
+ "Harmlessness": 0.0755,
1151
+ "3C3H Score": 0.0538
1152
+ },
1153
+ "Tasks Scores": {
1154
+ "Question Answering (QA)": 0.0293,
1155
+ "Reasoning": 0.0756,
1156
+ "Orthographic and Grammatical Analysis": 0.0,
1157
+ "Safety": 0.2417
1158
+ }
1159
+ },
1160
+ "Meta": {
1161
+ "Model Name": "meta-llama/Llama-3.2-1B-Instruct",
1162
+ "License": "llama3.2",
1163
+ "Revision": "main",
1164
+ "Precision": "bfloat16",
1165
+ "Params": 1.0,
1166
+ "Total Entries": 279,
1167
+ "Successful Entries": 278,
1168
+ "Failed Entries": 1,
1169
+ "Success Ratio": 0.9964
1170
+ }
1171
+ },
1172
+ {
1173
+ "claude-3.5-sonnet Scores": {
1174
+ "3C3H Scores": {
1175
+ "Correctness": 0.2736,
1176
+ "Completeness": 0.2616,
1177
+ "Conciseness": 0.0792,
1178
+ "Helpfulness": 0.1971,
1179
+ "Honesty": 0.2315,
1180
+ "Harmlessness": 0.2727,
1181
+ "3C3H Score": 0.2193
1182
+ },
1183
+ "Tasks Scores": {
1184
+ "Question Answering (QA)": 0.2133,
1185
+ "Reasoning": 0.28,
1186
+ "Orthographic and Grammatical Analysis": 0.0,
1187
+ "Safety": 0.3771
1188
+ }
1189
+ },
1190
+ "Meta": {
1191
+ "Model Name": "meta-llama/Llama-3.2-3B-Instruct",
1192
+ "License": "llama3.2",
1193
+ "Revision": "main",
1194
+ "Precision": "bfloat16",
1195
+ "Params": 3.0,
1196
+ "Total Entries": 279,
1197
+ "Successful Entries": 279,
1198
+ "Failed Entries": 0,
1199
+ "Success Ratio": 1.0
1200
+ }
1201
+ },
1202
+ {
1203
+ "claude-3.5-sonnet Scores": {
1204
+ "3C3H Scores": {
1205
+ "Correctness": 0.6296,
1206
+ "Completeness": 0.6165,
1207
+ "Conciseness": 0.2258,
1208
+ "Helpfulness": 0.5923,
1209
+ "Honesty": 0.6123,
1210
+ "Harmlessness": 0.6296,
1211
+ "3C3H Score": 0.551
1212
+ },
1213
+ "Tasks Scores": {
1214
+ "Question Answering (QA)": 0.6538,
1215
+ "Reasoning": 0.6033,
1216
+ "Orthographic and Grammatical Analysis": 0.0309,
1217
+ "Safety": 0.375
1218
+ }
1219
+ },
1220
+ "Meta": {
1221
+ "Model Name": "meta-llama/Llama-3.2-90B-Vision-Instruct",
1222
+ "License": "llama3.2",
1223
+ "Revision": "main",
1224
+ "Precision": "bfloat16",
1225
+ "Params": 90.0,
1226
+ "Total Entries": 279,
1227
+ "Successful Entries": 279,
1228
+ "Failed Entries": 0,
1229
+ "Success Ratio": 1.0
1230
+ }
1231
+ },
1232
+ {
1233
+ "claude-3.5-sonnet Scores": {
1234
+ "3C3H Scores": {
1235
+ "Correctness": 0.6858,
1236
+ "Completeness": 0.6511,
1237
+ "Conciseness": 0.345,
1238
+ "Helpfulness": 0.635,
1239
+ "Honesty": 0.6747,
1240
+ "Harmlessness": 0.6858,
1241
+ "3C3H Score": 0.6129
1242
+ },
1243
+ "Tasks Scores": {
1244
+ "Question Answering (QA)": 0.7062,
1245
+ "Reasoning": 0.6394,
1246
+ "Orthographic and Grammatical Analysis": 0.0215,
1247
+ "Safety": 0.7167
1248
+ }
1249
+ },
1250
+ "Meta": {
1251
+ "Model Name": "meta-llama/Llama-3.3-70B-Instruct",
1252
+ "License": "llama3.3",
1253
+ "Revision": "main",
1254
+ "Precision": "bfloat16",
1255
+ "Params": 70.0,
1256
+ "Total Entries": 279,
1257
+ "Successful Entries": 279,
1258
+ "Failed Entries": 0,
1259
+ "Success Ratio": 1.0
1260
+ }
1261
+ },
1262
+ {
1263
+ "claude-3.5-sonnet Scores": {
1264
+ "3C3H Scores": {
1265
+ "Correctness": 0.3321,
1266
+ "Completeness": 0.1434,
1267
+ "Conciseness": 0.0403,
1268
+ "Helpfulness": 0.1359,
1269
+ "Honesty": 0.2631,
1270
+ "Harmlessness": 0.3295,
1271
+ "3C3H Score": 0.2074
1272
+ },
1273
+ "Tasks Scores": {
1274
+ "Question Answering (QA)": 0.2891,
1275
+ "Reasoning": 0.1744,
1276
+ "Orthographic and Grammatical Analysis": 0.0175,
1277
+ "Safety": 0.0
1278
+ }
1279
+ },
1280
+ "Meta": {
1281
+ "Model Name": "stabilityai/ar-stablelm-2-chat",
1282
+ "License": "other",
1283
+ "Revision": "main",
1284
+ "Precision": "float32",
1285
+ "Params": 2.0,
1286
+ "Total Entries": 279,
1287
+ "Successful Entries": 279,
1288
+ "Failed Entries": 0,
1289
+ "Success Ratio": 1.0
1290
+ }
1291
+ },
1292
+ {
1293
+ "claude-3.5-sonnet Scores": {
1294
+ "3C3H Scores": {
1295
+ "Correctness": 0.5317,
1296
+ "Completeness": 0.4875,
1297
+ "Conciseness": 0.1711,
1298
+ "Helpfulness": 0.4271,
1299
+ "Honesty": 0.4904,
1300
+ "Harmlessness": 0.5317,
1301
+ "3C3H Score": 0.4399
1302
+ },
1303
+ "Tasks Scores": {
1304
+ "Question Answering (QA)": 0.4885,
1305
+ "Reasoning": 0.4211,
1306
+ "Orthographic and Grammatical Analysis": 0.0323,
1307
+ "Safety": 0.7708
1308
+ }
1309
+ },
1310
+ "Meta": {
1311
+ "Model Name": "utter-project/EuroLLM-9B-Instruct",
1312
+ "License": "apache-2.0",
1313
+ "Revision": "main",
1314
+ "Precision": "bfloat16",
1315
+ "Params": 9.0,
1316
+ "Total Entries": 279,
1317
+ "Successful Entries": 279,
1318
+ "Failed Entries": 0,
1319
+ "Success Ratio": 1.0
1320
+ }
1321
+ },
1322
+ {
1323
+ "claude-3.5-sonnet Scores": {
1324
+ "3C3H Scores": {
1325
+ "Correctness": 0.6619,
1326
+ "Completeness": 0.6356,
1327
+ "Conciseness": 0.1938,
1328
+ "Helpfulness": 0.6353,
1329
+ "Honesty": 0.6526,
1330
+ "Harmlessness": 0.661,
1331
+ "3C3H Score": 0.5734
1332
+ },
1333
+ "Tasks Scores": {
1334
+ "Question Answering (QA)": 0.7327,
1335
+ "Reasoning": 0.5506,
1336
+ "Orthographic and Grammatical Analysis": 0.0538,
1337
+ "Safety": 0.2458
1338
+ }
1339
+ },
1340
+ "Meta": {
1341
+ "Model Name": "CohereForAI/c4ai-command-r-plus-08-2024",
1342
+ "License": "cc-by-nc-4.0",
1343
+ "Revision": "main",
1344
+ "Precision": "float16",
1345
+ "Params": 104.0,
1346
+ "Total Entries": 279,
1347
+ "Successful Entries": 279,
1348
+ "Failed Entries": 0,
1349
+ "Success Ratio": 1.0
1350
+ }
1351
+ },
1352
+ {
1353
+ "_last_sync_timestamp": "2024-12-15T20:20:14.747963"
1354
  }
1355
  ]