LLMHallucination_Leaderboard / auto_leaderboard_scores.json
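For reference, a minimal Python sketch of how this file can be loaded and summarised. The relative path is an assumption (point it at wherever auto_leaderboard_scores.json lives), and the exact meaning of the "###"-separated top-level keys is not documented in the file itself; the structure below simply mirrors what the JSON contains.

import json

# Assumed path; adjust to your local copy of the repository.
with open("auto_leaderboard_scores.json", encoding="utf-8") as f:
    scores = json.load(f)

# Top-level keys are "###"-separated evaluator configurations
# (e.g. "gpt4###gpt4", "claude2###nli", "gpt4###ensemble"). Each maps a
# model name to per-dataset results for "nq", "msmarco", "dolly" plus an
# "avg" entry, each holding the percentages "abstain", "entailment",
# "neutral" and "contradiction".
for config, models in scores.items():
    avg = models["GPT-4"]["avg"]
    print(f"{config}: entailment={avg['entailment']:.2f}, "
          f"contradiction={avg['contradiction']:.2f}")
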
{
"gpt4###gpt4": {
"Alpaca 7B": {
"nq": {
"abstain": 6.0,
"entailment": 13.99723355840377,
"neutral": 55.25707379117893,
"contradiction": 30.745692650417304
},
"msmarco": {
"abstain": 0.0,
"entailment": 60.09599567099567,
"neutral": 16.86785714285714,
"contradiction": 23.036147186147183
},
"dolly": {
"abstain": 6.0,
"entailment": 79.38211283955965,
"neutral": 10.892249323100387,
"contradiction": 9.725637837339965
},
"avg": {
"abstain": 4.0,
"entailment": 51.34464627954212,
"neutral": 27.44729891329156,
"contradiction": 21.208054807166327
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 24.479416416916415,
"neutral": 43.261716061716065,
"contradiction": 32.25886752136752
},
"msmarco": {
"abstain": 0.0,
"entailment": 78.7480307274425,
"neutral": 15.489105339105338,
"contradiction": 5.762863933452168
},
"dolly": {
"abstain": 4.0,
"entailment": 85.07329620610871,
"neutral": 10.421267100954601,
"contradiction": 4.5054366929366925
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 62.46547685885921,
"neutral": 23.228120884370888,
"contradiction": 14.306402256769903
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 1.0,
"entailment": 14.200552533885865,
"neutral": 47.79314433503381,
"contradiction": 38.006303131080315
},
"msmarco": {
"abstain": 0.0,
"entailment": 85.34064889788574,
"neutral": 8.92633828160144,
"contradiction": 5.73301282051282
},
"dolly": {
"abstain": 0.0,
"entailment": 89.29750652783859,
"neutral": 3.751092146836928,
"contradiction": 6.9514013253244755
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 63.10926502818439,
"neutral": 20.064429204054132,
"contradiction": 16.826305767761475
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 59.383225108225105,
"neutral": 19.45919913419913,
"contradiction": 21.157575757575756
},
"msmarco": {
"abstain": 0.0,
"entailment": 71.1136028947388,
"neutral": 9.210206629070726,
"contradiction": 19.676190476190477
},
"dolly": {
"abstain": 0.0,
"entailment": 94.78053890553892,
"neutral": 3.2243145743145742,
"contradiction": 1.99514652014652
},
"avg": {
"abstain": 0.0,
"entailment": 75.09245563616761,
"neutral": 10.631240112528143,
"contradiction": 14.27630425130425
}
},
"Claude 2": {
"nq": {
"abstain": 0.0,
"entailment": 33.515945842587236,
"neutral": 56.08377297174671,
"contradiction": 10.400281185666048
},
"msmarco": {
"abstain": 0.0,
"entailment": 81.8466486944428,
"neutral": 15.355360407566291,
"contradiction": 2.797990897990898
},
"dolly": {
"abstain": 0.0,
"entailment": 90.57300115343594,
"neutral": 8.462604907170123,
"contradiction": 0.9643939393939394
},
"avg": {
"abstain": 0.0,
"entailment": 68.64519856348866,
"neutral": 26.633912762161042,
"contradiction": 4.720888674350295
}
},
"InstructGPT": {
"nq": {
"abstain": 0.0,
"entailment": 17.83611111111111,
"neutral": 25.714646464646464,
"contradiction": 56.44924242424243
},
"msmarco": {
"abstain": 0.0,
"entailment": 68.26282051282051,
"neutral": 14.649999999999999,
"contradiction": 17.087179487179487
},
"dolly": {
"abstain": 0.0,
"entailment": 83.57719502719503,
"neutral": 4.662121212121211,
"contradiction": 11.76068376068376
},
"avg": {
"abstain": 0.0,
"entailment": 56.55870888370889,
"neutral": 15.008922558922558,
"contradiction": 28.43236855736856
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 31.466666666666658,
"neutral": 21.15,
"contradiction": 47.38333333333333
},
"msmarco": {
"abstain": 0.0,
"entailment": 63.1717903828198,
"neutral": 18.362336601307188,
"contradiction": 18.465873015873015
},
"dolly": {
"abstain": 1.0,
"entailment": 79.68616961041204,
"neutral": 13.873018115442356,
"contradiction": 6.440812274145609
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 58.03604179391117,
"neutral": 17.808235630633817,
"contradiction": 24.155722575455016
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 44.10430839002268,
"neutral": 12.655895691609977,
"contradiction": 43.23979591836735
},
"msmarco": {
"abstain": 5.0,
"entailment": 80.37009189640769,
"neutral": 7.900584795321638,
"contradiction": 11.729323308270676
},
"dolly": {
"abstain": 21.0,
"entailment": 88.43881856540084,
"neutral": 7.088607594936709,
"contradiction": 4.472573839662447
},
"avg": {
"abstain": 14.000000000000002,
"entailment": 71.03328411467946,
"neutral": 9.200196874615479,
"contradiction": 19.766519010705057
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 73.75205627705628,
"neutral": 14.564069264069266,
"contradiction": 11.68387445887446
},
"msmarco": {
"abstain": 0.0,
"entailment": 91.21498599439775,
"neutral": 6.654761904761905,
"contradiction": 2.1302521008403357
},
"dolly": {
"abstain": 0.0,
"entailment": 94.81666666666666,
"neutral": 3.116666666666667,
"contradiction": 2.0666666666666664
},
"avg": {
"abstain": 0.0,
"entailment": 86.59456964604023,
"neutral": 8.111832611832611,
"contradiction": 5.2935977421271545
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 42.40319186000132,
"neutral": 51.7920209124493,
"contradiction": 5.804787227549376
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.26384479813274,
"neutral": 6.928442081654156,
"contradiction": 2.80771312021312
},
"dolly": {
"abstain": 0.0,
"entailment": 92.30753437738731,
"neutral": 6.346387191240133,
"contradiction": 1.3460784313725491
},
"avg": {
"abstain": 0.0,
"entailment": 74.99152367850712,
"neutral": 21.688950061781195,
"contradiction": 3.3195262597116817
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 16.142521900097655,
"neutral": 24.271539347296923,
"contradiction": 59.585938752605415
},
"msmarco": {
"abstain": 0.0,
"entailment": 65.9702380952381,
"neutral": 16.333333333333332,
"contradiction": 17.69642857142857
},
"dolly": {
"abstain": 1.0,
"entailment": 93.67243867243869,
"neutral": 1.7316017316017316,
"contradiction": 4.595959595959596
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 58.61981512149297,
"neutral": 14.119611745450669,
"contradiction": 27.260573133056354
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 13.783340375368242,
"neutral": 63.02361095528411,
"contradiction": 23.193048669347633
},
"msmarco": {
"abstain": 0.0,
"entailment": 79.93675946516504,
"neutral": 13.745760895451298,
"contradiction": 6.317479639383664
},
"dolly": {
"abstain": 0.0,
"entailment": 88.1102897102897,
"neutral": 7.378410478410478,
"contradiction": 4.5112998112998115
},
"avg": {
"abstain": 0.0,
"entailment": 60.61012985027433,
"neutral": 28.049260776381967,
"contradiction": 11.3406093733437
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 23.01231170789994,
"neutral": 59.220105058340344,
"contradiction": 17.7675832337597
},
"msmarco": {
"abstain": 0.0,
"entailment": 80.37351545513309,
"neutral": 14.298593563299447,
"contradiction": 5.327890981567451
},
"dolly": {
"abstain": 0.0,
"entailment": 88.30580919080919,
"neutral": 7.055904095904094,
"contradiction": 4.638286713286713
},
"avg": {
"abstain": 0.0,
"entailment": 63.89721211794741,
"neutral": 26.858200905847962,
"contradiction": 9.244586976204625
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 23.616815331211615,
"neutral": 62.14374898407405,
"contradiction": 14.239435684714321
},
"msmarco": {
"abstain": 0.0,
"entailment": 80.95581085581085,
"neutral": 13.398103285603286,
"contradiction": 5.646085858585859
},
"dolly": {
"abstain": 0.0,
"entailment": 91.00456349206348,
"neutral": 4.918849206349206,
"contradiction": 4.076587301587302
},
"avg": {
"abstain": 0.0,
"entailment": 65.19239655969533,
"neutral": 26.820233825342182,
"contradiction": 7.9873696149624935
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 21.008333333333333,
"neutral": 40.861111111111114,
"contradiction": 38.13055555555555
},
"msmarco": {
"abstain": 0.0,
"entailment": 81.84719274390328,
"neutral": 9.653496479154374,
"contradiction": 8.499310776942357
},
"dolly": {
"abstain": 0.0,
"entailment": 90.9826555797144,
"neutral": 4.769992752345694,
"contradiction": 4.247351667939903
},
"avg": {
"abstain": 0.0,
"entailment": 64.61272721898366,
"neutral": 18.428200114203726,
"contradiction": 16.959072666812606
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 9.0,
"entailment": 34.585509118476146,
"neutral": 40.91025275091209,
"contradiction": 24.504238130611753
},
"msmarco": {
"abstain": 0.0,
"entailment": 84.2516825151113,
"neutral": 13.516968278539485,
"contradiction": 2.2313492063492064
},
"dolly": {
"abstain": 0.0,
"entailment": 94.52218810601164,
"neutral": 3.65296451914099,
"contradiction": 1.824847374847375
},
"avg": {
"abstain": 3.0,
"entailment": 72.2497195597719,
"neutral": 18.6935611000036,
"contradiction": 9.056719340224495
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 0.0,
"entailment": 37.58214318622853,
"neutral": 56.106515555046656,
"contradiction": 6.311341258724823
},
"msmarco": {
"abstain": 0.0,
"entailment": 66.17400970976048,
"neutral": 29.125711679960904,
"contradiction": 4.70027861027861
},
"dolly": {
"abstain": 0.0,
"entailment": 81.6958587562942,
"neutral": 14.704152915040988,
"contradiction": 3.599988328664799
},
"avg": {
"abstain": 0.0,
"entailment": 61.817337217427735,
"neutral": 33.31212671668285,
"contradiction": 4.870536065889411
}
},
"Phi-2": {
"nq": {
"abstain": 0.0,
"entailment": 13.383297095061799,
"neutral": 34.92620549385255,
"contradiction": 51.690497411085644
},
"msmarco": {
"abstain": 1.0,
"entailment": 64.93630890182615,
"neutral": 8.344114378597137,
"contradiction": 26.71957671957672
},
"dolly": {
"abstain": 1.0,
"entailment": 81.9274636698879,
"neutral": 9.039372524221008,
"contradiction": 9.033163805891077
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 53.28135300035527,
"neutral": 17.49525420390689,
"contradiction": 29.22339279573784
}
}
},
"gpt4###claude2": {
"Alpaca 7B": {
"nq": {
"abstain": 6.0,
"entailment": 18.25731154188601,
"neutral": 42.409898976701236,
"contradiction": 39.33278948141276
},
"msmarco": {
"abstain": 0.0,
"entailment": 60.126154401154395,
"neutral": 10.367857142857142,
"contradiction": 29.50598845598845
},
"dolly": {
"abstain": 6.0,
"entailment": 82.814115234328,
"neutral": 7.595654856293153,
"contradiction": 9.590229909378847
},
"avg": {
"abstain": 4.0,
"entailment": 53.86572762874846,
"neutral": 19.92120755064995,
"contradiction": 26.213064820601584
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 31.378349428349427,
"neutral": 31.939990564990563,
"contradiction": 36.68166000666001
},
"msmarco": {
"abstain": 0.0,
"entailment": 74.94037645361173,
"neutral": 8.740503777268483,
"contradiction": 16.31911976911977
},
"dolly": {
"abstain": 4.0,
"entailment": 85.24289512570763,
"neutral": 9.756367764180265,
"contradiction": 5.00073711011211
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 63.56483283872989,
"neutral": 16.907637633740574,
"contradiction": 19.527529527529527
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 1.0,
"entailment": 18.259536726559897,
"neutral": 33.503915045804526,
"contradiction": 48.236548227635566
},
"msmarco": {
"abstain": 0.0,
"entailment": 75.95304330172752,
"neutral": 7.062425074925074,
"contradiction": 16.984531623347415
},
"dolly": {
"abstain": 0.0,
"entailment": 87.0074961850294,
"neutral": 5.291176337893605,
"contradiction": 7.701327477077003
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 60.547652456873315,
"neutral": 15.224908798717445,
"contradiction": 24.227438744409238
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 62.99354256854257,
"neutral": 16.275,
"contradiction": 20.73145743145743
},
"msmarco": {
"abstain": 0.0,
"entailment": 67.44712248535778,
"neutral": 21.074516170104406,
"contradiction": 11.478361344537815
},
"dolly": {
"abstain": 0.0,
"entailment": 91.1153207903208,
"neutral": 4.937709512709513,
"contradiction": 3.946969696969697
},
"avg": {
"abstain": 0.0,
"entailment": 73.85199528140703,
"neutral": 14.095741894271304,
"contradiction": 12.052262824321646
}
},
"Claude 2": {
"nq": {
"abstain": 0.0,
"entailment": 40.896599279129696,
"neutral": 35.503456499403704,
"contradiction": 23.599944221466586
},
"msmarco": {
"abstain": 0.0,
"entailment": 80.31219292982483,
"neutral": 6.522612705716084,
"contradiction": 13.165194364459069
},
"dolly": {
"abstain": 0.0,
"entailment": 92.96594987138467,
"neutral": 2.945521990087208,
"contradiction": 4.088528138528139
},
"avg": {
"abstain": 0.0,
"entailment": 71.39158069344641,
"neutral": 14.990530398402333,
"contradiction": 13.617888908151265
}
},
"InstructGPT": {
"nq": {
"abstain": 0.0,
"entailment": 20.113888888888887,
"neutral": 20.6520202020202,
"contradiction": 59.23409090909091
},
"msmarco": {
"abstain": 0.0,
"entailment": 53.63418803418804,
"neutral": 19.26111111111111,
"contradiction": 27.10470085470085
},
"dolly": {
"abstain": 0.0,
"entailment": 79.65044979175414,
"neutral": 9.311912845608497,
"contradiction": 11.037637362637364
},
"avg": {
"abstain": 0.0,
"entailment": 51.13284223827702,
"neutral": 16.40834805291327,
"contradiction": 32.45880970880971
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 34.53333333333333,
"neutral": 27.166666666666668,
"contradiction": 38.3
},
"msmarco": {
"abstain": 0.0,
"entailment": 63.6829365079365,
"neutral": 17.644444444444446,
"contradiction": 18.672619047619047
},
"dolly": {
"abstain": 1.0,
"entailment": 80.57762383519959,
"neutral": 10.77410126530361,
"contradiction": 8.648274899496794
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 59.527798474286776,
"neutral": 18.554338248749723,
"contradiction": 21.917863276963505
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 47.250566893424036,
"neutral": 5.938208616780045,
"contradiction": 46.81122448979592
},
"msmarco": {
"abstain": 5.0,
"entailment": 76.76587301587303,
"neutral": 13.350668337510443,
"contradiction": 9.88345864661654
},
"dolly": {
"abstain": 21.0,
"entailment": 89.40928270042195,
"neutral": 8.755274261603375,
"contradiction": 1.8354430379746836
},
"avg": {
"abstain": 14.000000000000002,
"entailment": 71.02767011197244,
"neutral": 9.530192567983265,
"contradiction": 19.4421373200443
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 76.63506493506493,
"neutral": 8.206493506493505,
"contradiction": 15.158441558441558
},
"msmarco": {
"abstain": 0.0,
"entailment": 77.60539558480735,
"neutral": 10.07580099638923,
"contradiction": 12.31880341880342
},
"dolly": {
"abstain": 0.0,
"entailment": 89.10139439507861,
"neutral": 5.357177033492823,
"contradiction": 5.541428571428571
},
"avg": {
"abstain": 0.0,
"entailment": 81.11395163831698,
"neutral": 7.879823845458519,
"contradiction": 11.006224516224515
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 49.57831732774188,
"neutral": 35.823720564033856,
"contradiction": 14.597962108224257
},
"msmarco": {
"abstain": 0.0,
"entailment": 78.94691601398256,
"neutral": 8.053141547994489,
"contradiction": 12.999942438022932
},
"dolly": {
"abstain": 0.0,
"entailment": 87.42128714561842,
"neutral": 7.118279540886718,
"contradiction": 5.46043331349486
},
"avg": {
"abstain": 0.0,
"entailment": 71.98217349578096,
"neutral": 16.99838055097169,
"contradiction": 11.019445953247352
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 18.76998702756278,
"neutral": 15.326096462460098,
"contradiction": 65.90391650997712
},
"msmarco": {
"abstain": 0.0,
"entailment": 69.16785714285714,
"neutral": 5.589285714285714,
"contradiction": 25.24285714285714
},
"dolly": {
"abstain": 1.0,
"entailment": 90.78540137679923,
"neutral": 1.2027598049103423,
"contradiction": 8.011838818290432
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 59.60660794066294,
"neutral": 7.366729335229009,
"contradiction": 33.02666272410806
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 19.98762667357869,
"neutral": 42.64563110601777,
"contradiction": 37.36674222040355
},
"msmarco": {
"abstain": 0.0,
"entailment": 75.41355530434477,
"neutral": 9.799732925313421,
"contradiction": 14.786711770341801
},
"dolly": {
"abstain": 0.0,
"entailment": 86.99999167499168,
"neutral": 8.07666638916639,
"contradiction": 4.923341935841935
},
"avg": {
"abstain": 0.0,
"entailment": 60.800391217638385,
"neutral": 20.17401014016586,
"contradiction": 19.02559864219576
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 28.03634641502289,
"neutral": 40.386883786148495,
"contradiction": 31.576769798828618
},
"msmarco": {
"abstain": 0.0,
"entailment": 73.92040010642953,
"neutral": 9.47142253171665,
"contradiction": 16.60817736185383
},
"dolly": {
"abstain": 0.0,
"entailment": 86.0424295149295,
"neutral": 7.826956654456656,
"contradiction": 6.13061383061383
},
"avg": {
"abstain": 0.0,
"entailment": 62.666392012127304,
"neutral": 19.22842099077393,
"contradiction": 18.105186997098766
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 33.01317670977114,
"neutral": 43.62549894635808,
"contradiction": 23.361324343870784
},
"msmarco": {
"abstain": 0.0,
"entailment": 78.45617536058714,
"neutral": 6.950282070870307,
"contradiction": 14.593542568542569
},
"dolly": {
"abstain": 0.0,
"entailment": 88.81219336219337,
"neutral": 4.924440836940836,
"contradiction": 6.2633658008658
},
"avg": {
"abstain": 0.0,
"entailment": 66.76051514418387,
"neutral": 18.50007395138974,
"contradiction": 14.739410904426384
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 25.72936507936508,
"neutral": 31.798484848484847,
"contradiction": 42.47215007215007
},
"msmarco": {
"abstain": 0.0,
"entailment": 78.53683567474403,
"neutral": 9.690405651181512,
"contradiction": 11.772758674074465
},
"dolly": {
"abstain": 0.0,
"entailment": 90.47255195784608,
"neutral": 4.876535804349103,
"contradiction": 4.65091223780482
},
"avg": {
"abstain": 0.0,
"entailment": 64.91291757065171,
"neutral": 15.455142101338486,
"contradiction": 19.631940328009787
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 9.0,
"entailment": 38.88431775794413,
"neutral": 32.46784808597995,
"contradiction": 28.647834156075913
},
"msmarco": {
"abstain": 0.0,
"entailment": 82.03860673640085,
"neutral": 6.29840518958166,
"contradiction": 11.662988074017486
},
"dolly": {
"abstain": 0.0,
"entailment": 91.23029111411465,
"neutral": 4.810335905924141,
"contradiction": 3.959372979961215
},
"avg": {
"abstain": 3.0,
"entailment": 71.70227732310812,
"neutral": 13.970612664518061,
"contradiction": 14.327110012373808
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 0.0,
"entailment": 43.97499796217154,
"neutral": 36.883149077108556,
"contradiction": 19.141852960719902
},
"msmarco": {
"abstain": 0.0,
"entailment": 62.28904125200564,
"neutral": 20.13327764770489,
"contradiction": 17.57768110028946
},
"dolly": {
"abstain": 0.0,
"entailment": 82.57194937846776,
"neutral": 11.374005503090844,
"contradiction": 6.054045118441402
},
"avg": {
"abstain": 0.0,
"entailment": 62.945329530881644,
"neutral": 22.796810742634765,
"contradiction": 14.257859726483588
}
},
"Phi-2": {
"nq": {
"abstain": 0.0,
"entailment": 15.251384399913812,
"neutral": 20.070571585277467,
"contradiction": 64.67804401480873
},
"msmarco": {
"abstain": 1.0,
"entailment": 57.61199847406744,
"neutral": 13.316332454263488,
"contradiction": 29.07166907166907
},
"dolly": {
"abstain": 1.0,
"entailment": 83.9150856240874,
"neutral": 9.129040563978176,
"contradiction": 6.955873811934418
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 52.13530122721043,
"neutral": 14.191775460851247,
"contradiction": 33.67292331193832
}
}
},
"gpt4###nli": {
"Alpaca 7B": {
"nq": {
"abstain": 6.0,
"entailment": 39.32247447657334,
"neutral": 38.358259995462745,
"contradiction": 22.3192655279639
},
"msmarco": {
"abstain": 0.0,
"entailment": 84.05039682539682,
"neutral": 6.4722222222222205,
"contradiction": 9.477380952380951
},
"dolly": {
"abstain": 6.0,
"entailment": 87.36364403917595,
"neutral": 4.80580884836204,
"contradiction": 7.830547112462007
},
"avg": {
"abstain": 4.0,
"entailment": 70.53310702437541,
"neutral": 16.33557185257553,
"contradiction": 13.131321123049064
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 46.298579198579205,
"neutral": 34.834310134310144,
"contradiction": 18.867110667110666
},
"msmarco": {
"abstain": 0.0,
"entailment": 93.89758297258298,
"neutral": 4.43531746031746,
"contradiction": 1.6670995670995674
},
"dolly": {
"abstain": 4.0,
"entailment": 90.07618538868539,
"neutral": 5.7987541971916965,
"contradiction": 4.125060414122914
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 76.57746626496626,
"neutral": 15.147443116193116,
"contradiction": 8.275090618840618
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 1.0,
"entailment": 40.357838009353166,
"neutral": 36.34073278012672,
"contradiction": 23.301429210520126
},
"msmarco": {
"abstain": 0.0,
"entailment": 94.03385225885225,
"neutral": 3.773214285714286,
"contradiction": 2.1929334554334554
},
"dolly": {
"abstain": 0.0,
"entailment": 91.09916302335658,
"neutral": 5.4497557997558,
"contradiction": 3.4510811768876293
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 75.28002505400283,
"neutral": 15.117155698259378,
"contradiction": 9.602819247737795
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 68.39837662337662,
"neutral": 20.376190476190477,
"contradiction": 11.2254329004329
},
"msmarco": {
"abstain": 0.0,
"entailment": 84.8015873015873,
"neutral": 3.198412698412698,
"contradiction": 12.0
},
"dolly": {
"abstain": 0.0,
"entailment": 93.07893772893772,
"neutral": 2.083333333333333,
"contradiction": 4.837728937728937
},
"avg": {
"abstain": 0.0,
"entailment": 82.09296721796721,
"neutral": 8.552645502645504,
"contradiction": 9.35438727938728
}
},
"Claude 2": {
"nq": {
"abstain": 0.0,
"entailment": 52.74511413642406,
"neutral": 36.94317770159689,
"contradiction": 10.31170816197906
},
"msmarco": {
"abstain": 0.0,
"entailment": 91.50773074964252,
"neutral": 3.927052522640758,
"contradiction": 4.565216727716727
},
"dolly": {
"abstain": 0.0,
"entailment": 93.21219336219335,
"neutral": 3.090151515151515,
"contradiction": 3.6976551226551226
},
"avg": {
"abstain": 0.0,
"entailment": 79.15501274941997,
"neutral": 14.653460579796384,
"contradiction": 6.191526670783637
}
},
"InstructGPT": {
"nq": {
"abstain": 0.0,
"entailment": 35.4520202020202,
"neutral": 37.351010101010104,
"contradiction": 27.196969696969695
},
"msmarco": {
"abstain": 0.0,
"entailment": 86.46666666666667,
"neutral": 3.366666666666667,
"contradiction": 10.166666666666668
},
"dolly": {
"abstain": 0.0,
"entailment": 91.43393719806764,
"neutral": 4.747222222222222,
"contradiction": 3.818840579710145
},
"avg": {
"abstain": 0.0,
"entailment": 71.11754135558483,
"neutral": 15.154966329966326,
"contradiction": 13.727492314448837
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 48.08333333333333,
"neutral": 24.966666666666665,
"contradiction": 26.950000000000003
},
"msmarco": {
"abstain": 0.0,
"entailment": 87.07539682539684,
"neutral": 4.231944444444444,
"contradiction": 8.69265873015873
},
"dolly": {
"abstain": 1.0,
"entailment": 89.45004770762347,
"neutral": 6.700992610083518,
"contradiction": 3.8489596822930157
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 74.82082855828676,
"neutral": 11.984145081971167,
"contradiction": 13.19502635974208
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 60.01984126984128,
"neutral": 26.927437641723355,
"contradiction": 13.052721088435373
},
"msmarco": {
"abstain": 5.0,
"entailment": 92.18045112781955,
"neutral": 0.6265664160401002,
"contradiction": 7.192982456140351
},
"dolly": {
"abstain": 21.0,
"entailment": 91.98312236286921,
"neutral": 2.5105485232067513,
"contradiction": 5.5063291139240516
},
"avg": {
"abstain": 14.000000000000002,
"entailment": 81.64913252122554,
"neutral": 9.766519010705057,
"contradiction": 8.584348468069397
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 72.07348484848485,
"neutral": 15.993506493506493,
"contradiction": 11.933008658008657
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.64404761904763,
"neutral": 1.6726190476190474,
"contradiction": 7.683333333333334
},
"dolly": {
"abstain": 0.0,
"entailment": 92.01666666666667,
"neutral": 2.7333333333333334,
"contradiction": 5.25
},
"avg": {
"abstain": 0.0,
"entailment": 84.91139971139971,
"neutral": 6.799819624819625,
"contradiction": 8.288780663780663
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 56.9450380537337,
"neutral": 35.9172510460554,
"contradiction": 7.1377109002109
},
"msmarco": {
"abstain": 0.0,
"entailment": 92.6530035324153,
"neutral": 2.473513986013986,
"contradiction": 4.873482481570717
},
"dolly": {
"abstain": 0.0,
"entailment": 91.88844643918173,
"neutral": 4.50254329004329,
"contradiction": 3.6090102707749776
},
"avg": {
"abstain": 0.0,
"entailment": 80.49549600844358,
"neutral": 14.297769440704222,
"contradiction": 5.2067345508521985
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 38.36521783491481,
"neutral": 40.7184398093489,
"contradiction": 20.916342355736298
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.1,
"neutral": 3.0,
"contradiction": 6.9
},
"dolly": {
"abstain": 1.0,
"entailment": 94.17997176061694,
"neutral": 2.8956228956228958,
"contradiction": 2.9244053437601822
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 74.26836835556256,
"neutral": 15.495947006014118,
"contradiction": 10.235684638423326
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 39.45354032269162,
"neutral": 41.36536239519144,
"contradiction": 19.181097282116934
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.75689092469278,
"neutral": 4.089282509715947,
"contradiction": 5.15382656559127
},
"dolly": {
"abstain": 0.0,
"entailment": 91.38357198357197,
"neutral": 6.125280275280275,
"contradiction": 2.491147741147741
},
"avg": {
"abstain": 0.0,
"entailment": 73.86466774365213,
"neutral": 17.19330839339589,
"contradiction": 8.942023862951984
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 44.208278404601934,
"neutral": 40.353214677479386,
"contradiction": 15.438506917918685
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.78491092241092,
"neutral": 4.708014208014208,
"contradiction": 4.507074869574869
},
"dolly": {
"abstain": 0.0,
"entailment": 89.71058136308135,
"neutral": 7.08147102897103,
"contradiction": 3.207947607947608
},
"avg": {
"abstain": 0.0,
"entailment": 74.90125689669807,
"neutral": 17.380899971488205,
"contradiction": 7.71784313181372
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 46.93413867264486,
"neutral": 41.25886271451287,
"contradiction": 11.806998612842266
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.40409729159728,
"neutral": 3.5948565323565322,
"contradiction": 6.0010461760461755
},
"dolly": {
"abstain": 0.0,
"entailment": 92.89645863395864,
"neutral": 3.8927248677248674,
"contradiction": 3.2108164983164973
},
"avg": {
"abstain": 0.0,
"entailment": 76.74489819940025,
"neutral": 16.248814704864756,
"contradiction": 7.006287095734981
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 41.6274531024531,
"neutral": 36.400685425685424,
"contradiction": 21.971861471861473
},
"msmarco": {
"abstain": 0.0,
"entailment": 91.71748436748436,
"neutral": 3.015873015873016,
"contradiction": 5.266642616642617
},
"dolly": {
"abstain": 0.0,
"entailment": 91.86404151404152,
"neutral": 4.048840048840049,
"contradiction": 4.087118437118438
},
"avg": {
"abstain": 0.0,
"entailment": 75.06965966132633,
"neutral": 14.488466163466162,
"contradiction": 10.44187417520751
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 9.0,
"entailment": 55.757173229700705,
"neutral": 30.82707432982158,
"contradiction": 13.415752440477716
},
"msmarco": {
"abstain": 0.0,
"entailment": 92.53871041239462,
"neutral": 3.835459861775651,
"contradiction": 3.625829725829726
},
"dolly": {
"abstain": 0.0,
"entailment": 93.59346405228757,
"neutral": 3.4945378151260504,
"contradiction": 2.911998132586368
},
"avg": {
"abstain": 3.0,
"entailment": 81.39903852361162,
"neutral": 12.158981208604583,
"contradiction": 6.441980267783785
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 0.0,
"entailment": 52.405351271610186,
"neutral": 38.84090592742835,
"contradiction": 8.75374280096147
},
"msmarco": {
"abstain": 0.0,
"entailment": 82.35172429393793,
"neutral": 8.621489119755374,
"contradiction": 9.02678658630671
},
"dolly": {
"abstain": 0.0,
"entailment": 87.77936765265812,
"neutral": 8.346139296533025,
"contradiction": 3.87449305080884
},
"avg": {
"abstain": 0.0,
"entailment": 74.17881440606874,
"neutral": 18.60284478123891,
"contradiction": 7.21834081269234
}
},
"Phi-2": {
"nq": {
"abstain": 0.0,
"entailment": 37.42376349141055,
"neutral": 31.860378347143055,
"contradiction": 30.7158581614464
},
"msmarco": {
"abstain": 1.0,
"entailment": 84.80563710448767,
"neutral": 1.7716851050184383,
"contradiction": 13.42267779049388
},
"dolly": {
"abstain": 1.0,
"entailment": 90.86461658431355,
"neutral": 4.162038859008556,
"contradiction": 4.97334455667789
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 70.91856196084692,
"neutral": 12.662672842795228,
"contradiction": 16.41876519635787
}
}
},
"claude2###gpt4": {
"Alpaca 7B": {
"nq": {
"abstain": 14.000000000000002,
"entailment": 17.279784928553738,
"neutral": 54.678136026460244,
"contradiction": 28.042079044986018
},
"msmarco": {
"abstain": 2.0,
"entailment": 60.135244574020085,
"neutral": 19.745707806932298,
"contradiction": 20.11904761904762
},
"dolly": {
"abstain": 8.0,
"entailment": 80.05064229249012,
"neutral": 8.950785024154587,
"contradiction": 10.998572683355292
},
"avg": {
"abstain": 8.0,
"entailment": 53.42019769209666,
"neutral": 27.032142339047706,
"contradiction": 19.547659968855623
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 24.539521589521584,
"neutral": 45.49135133871976,
"contradiction": 29.96912707175865
},
"msmarco": {
"abstain": 1.0,
"entailment": 80.04945671612337,
"neutral": 12.53878837212171,
"contradiction": 7.411754911754912
},
"dolly": {
"abstain": 5.0,
"entailment": 86.96112737044788,
"neutral": 9.479196925139572,
"contradiction": 3.559675704412547
},
"avg": {
"abstain": 2.0,
"entailment": 63.40188936748613,
"neutral": 22.75849962789212,
"contradiction": 13.839611004621746
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 6.0,
"entailment": 15.019736295549812,
"neutral": 52.191161119509054,
"contradiction": 32.78910258494113
},
"msmarco": {
"abstain": 1.0,
"entailment": 82.45668774072,
"neutral": 9.93057057955266,
"contradiction": 7.612741679727325
},
"dolly": {
"abstain": 0.0,
"entailment": 90.82091662355887,
"neutral": 4.190149105280684,
"contradiction": 4.988934271160438
},
"avg": {
"abstain": 2.333333333333333,
"entailment": 63.67631044528618,
"neutral": 21.529387519241066,
"contradiction": 14.79430203547275
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 2.0,
"entailment": 59.695224813377344,
"neutral": 21.20497610159264,
"contradiction": 19.09979908503002
},
"msmarco": {
"abstain": 27.0,
"entailment": 88.15682024281583,
"neutral": 6.343433435520053,
"contradiction": 5.499746321664131
},
"dolly": {
"abstain": 1.0,
"entailment": 95.4234808401475,
"neutral": 2.4681337181337186,
"contradiction": 2.108385441718775
},
"avg": {
"abstain": 10.000000000000002,
"entailment": 80.49075745411533,
"neutral": 10.31667976608993,
"contradiction": 9.192562779794747
}
},
"Claude 2": {
"nq": {
"abstain": 4.0,
"entailment": 32.109102264453135,
"neutral": 58.42244482253582,
"contradiction": 9.468452913011049
},
"msmarco": {
"abstain": 5.0,
"entailment": 84.18158779373542,
"neutral": 11.64128936350706,
"contradiction": 4.177122842757518
},
"dolly": {
"abstain": 2.0,
"entailment": 92.85887729543452,
"neutral": 3.7146200688791637,
"contradiction": 3.426502635686309
},
"avg": {
"abstain": 3.6666666666666665,
"entailment": 69.82662502679912,
"neutral": 24.493114045836563,
"contradiction": 5.6802609273643005
}
},
"InstructGPT": {
"nq": {
"abstain": 3.0,
"entailment": 21.451071837669776,
"neutral": 31.415443090700823,
"contradiction": 47.1334850716294
},
"msmarco": {
"abstain": 10.0,
"entailment": 65.95308395308395,
"neutral": 16.18953268953269,
"contradiction": 17.85738335738336
},
"dolly": {
"abstain": 1.0,
"entailment": 83.58174233174232,
"neutral": 6.273649190315857,
"contradiction": 10.144608477941812
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 56.961972079979084,
"neutral": 17.921144026913257,
"contradiction": 25.116883893107673
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 27.0,
"entailment": 42.71689497716895,
"neutral": 11.1986301369863,
"contradiction": 46.08447488584474
},
"msmarco": {
"abstain": 24.0,
"entailment": 66.49651088505578,
"neutral": 17.55737382672367,
"contradiction": 15.94611528822055
},
"dolly": {
"abstain": 1.0,
"entailment": 78.13956105622773,
"neutral": 13.84068092401426,
"contradiction": 8.019758019758019
},
"avg": {
"abstain": 17.333333333333336,
"entailment": 64.14469639179079,
"neutral": 14.201967025437137,
"contradiction": 21.653336582772067
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 46.36243386243386,
"neutral": 13.412698412698415,
"contradiction": 40.22486772486773
},
"msmarco": {
"abstain": 23.0,
"entailment": 86.50919787283424,
"neutral": 7.618896255259891,
"contradiction": 5.871905871905872
},
"dolly": {
"abstain": 20.0,
"entailment": 86.81628787878788,
"neutral": 10.114267676767676,
"contradiction": 3.0694444444444446
},
"avg": {
"abstain": 19.666666666666668,
"entailment": 72.61807348944279,
"neutral": 10.466651835946442,
"contradiction": 16.91527467461077
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 74.10079365079365,
"neutral": 16.965873015873015,
"contradiction": 8.933333333333334
},
"msmarco": {
"abstain": 10.0,
"entailment": 96.63786322609853,
"neutral": 2.4634439634439635,
"contradiction": 0.8986928104575163
},
"dolly": {
"abstain": 4.0,
"entailment": 97.38380832130832,
"neutral": 1.730324074074074,
"contradiction": 0.8858676046176047
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 89.00815613382458,
"neutral": 7.2881411517775145,
"contradiction": 3.7037027143979016
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 38.756567860244324,
"neutral": 56.47508194419959,
"contradiction": 4.768350195556078
},
"msmarco": {
"abstain": 2.0,
"entailment": 90.57744947795968,
"neutral": 7.354162560795214,
"contradiction": 2.0683879612451035
},
"dolly": {
"abstain": 2.0,
"entailment": 93.14978472767379,
"neutral": 5.756699944117017,
"contradiction": 1.0935153282092056
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 73.92204641275849,
"neutral": 23.420178107774863,
"contradiction": 2.657775479466656
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 5.0,
"entailment": 22.164449585502215,
"neutral": 20.386543281280122,
"contradiction": 57.449007133217656
},
"msmarco": {
"abstain": 17.0,
"entailment": 75.22470835723848,
"neutral": 6.1675272518646,
"contradiction": 18.607764390896918
},
"dolly": {
"abstain": 4.0,
"entailment": 92.95386904761904,
"neutral": 3.3482142857142856,
"contradiction": 3.6979166666666665
},
"avg": {
"abstain": 8.666666666666668,
"entailment": 63.03958004687931,
"neutral": 10.1096895804925,
"contradiction": 26.850730372628185
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 13.652765517611309,
"neutral": 64.02558767111891,
"contradiction": 22.321646811269783
},
"msmarco": {
"abstain": 4.0,
"entailment": 79.73895543498725,
"neutral": 13.48059324771681,
"contradiction": 6.780451317295946
},
"dolly": {
"abstain": 2.0,
"entailment": 84.89321695757202,
"neutral": 8.823196181110037,
"contradiction": 6.2835868613179535
},
"avg": {
"abstain": 2.3333333333333335,
"entailment": 59.133442900492675,
"neutral": 29.001171866793072,
"contradiction": 11.865385232714264
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 20.24408299548298,
"neutral": 62.712803116670976,
"contradiction": 17.043113887846037
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 79.32203847600931,
"neutral": 15.867196375789236,
"contradiction": 4.81076514820148
},
"dolly": {
"abstain": 1.0,
"entailment": 87.17030699590855,
"neutral": 6.6248289922098955,
"contradiction": 6.204864011881556
},
"avg": {
"abstain": 3.0,
"entailment": 61.89338208734234,
"neutral": 28.66005100250037,
"contradiction": 9.446566910157285
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 6.0,
"entailment": 23.479110830726334,
"neutral": 64.6221323496783,
"contradiction": 11.898756819595366
},
"msmarco": {
"abstain": 4.0,
"entailment": 84.34749226970695,
"neutral": 12.803920478173195,
"contradiction": 2.8485872521198607
},
"dolly": {
"abstain": 0.0,
"entailment": 91.5227342102342,
"neutral": 4.925252525252525,
"contradiction": 3.5520132645132643
},
"avg": {
"abstain": 3.3333333333333335,
"entailment": 67.09196240346057,
"neutral": 26.883386411378062,
"contradiction": 6.024651185161371
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 1.0,
"entailment": 22.038257757954728,
"neutral": 40.19017738714709,
"contradiction": 37.77156485489818
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 82.92628303651476,
"neutral": 10.929168560872345,
"contradiction": 6.144548402612919
},
"dolly": {
"abstain": 0.0,
"entailment": 89.23142123614261,
"neutral": 6.38880129158767,
"contradiction": 4.379777472269732
},
"avg": {
"abstain": 2.666666666666667,
"entailment": 64.44203412345085,
"neutral": 19.29493276249129,
"contradiction": 16.26303311405786
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 17.0,
"entailment": 36.236310389624414,
"neutral": 39.48460494592593,
"contradiction": 24.279084664449655
},
"msmarco": {
"abstain": 3.0,
"entailment": 84.34635482279727,
"neutral": 13.088403100294885,
"contradiction": 2.5652420769078392
},
"dolly": {
"abstain": 0.0,
"entailment": 95.51375944317121,
"neutral": 2.826283846872082,
"contradiction": 1.65995670995671
},
"avg": {
"abstain": 6.666666666666667,
"entailment": 74.07352187309743,
"neutral": 17.247948914027376,
"contradiction": 8.678529212875187
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 2.0,
"entailment": 35.72629530578894,
"neutral": 56.24796027594946,
"contradiction": 8.025744418261608
},
"msmarco": {
"abstain": 10.0,
"entailment": 67.06362111070854,
"neutral": 26.784287950951818,
"contradiction": 6.152090938339648
},
"dolly": {
"abstain": 2.0,
"entailment": 81.07176280231029,
"neutral": 15.255101992599624,
"contradiction": 3.6731352050900923
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 61.1256489320192,
"neutral": 32.92967139127088,
"contradiction": 5.944679676709912
}
},
"Phi-2": {
"nq": {
"abstain": 3.0,
"entailment": 17.055900773305265,
"neutral": 33.77093292799781,
"contradiction": 49.17316629869692
},
"msmarco": {
"abstain": 18.0,
"entailment": 68.90054051639417,
"neutral": 9.981573304744037,
"contradiction": 21.117886178861788
},
"dolly": {
"abstain": 3.0,
"entailment": 83.7804749759069,
"neutral": 10.46476432087879,
"contradiction": 5.754760703214312
},
"avg": {
"abstain": 8.0,
"entailment": 55.90932163049964,
"neutral": 18.512143638152327,
"contradiction": 25.57853473134803
}
}
},
"claude2###claude2": {
"Alpaca 7B": {
"nq": {
"abstain": 14.000000000000002,
"entailment": 23.131896603168833,
"neutral": 36.882768679109304,
"contradiction": 39.98533471772186
},
"msmarco": {
"abstain": 2.0,
"entailment": 59.7633404010955,
"neutral": 11.811224489795917,
"contradiction": 28.425435109108584
},
"dolly": {
"abstain": 8.0,
"entailment": 80.78606327247631,
"neutral": 10.320910973084887,
"contradiction": 8.893025754438797
},
"avg": {
"abstain": 8.0,
"entailment": 55.356769160317754,
"neutral": 19.126601144663805,
"contradiction": 25.516629695018445
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 30.435129636445424,
"neutral": 34.451116573485,
"contradiction": 35.11375379006959
},
"msmarco": {
"abstain": 1.0,
"entailment": 71.27010578764965,
"neutral": 12.857125006247813,
"contradiction": 15.872769206102538
},
"dolly": {
"abstain": 5.0,
"entailment": 84.05321978955268,
"neutral": 10.11703348341771,
"contradiction": 5.829746727029601
},
"avg": {
"abstain": 2.0,
"entailment": 61.51125617901144,
"neutral": 19.316616373781347,
"contradiction": 19.172127447207217
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 6.0,
"entailment": 18.26618590698691,
"neutral": 34.06163480866234,
"contradiction": 47.67217928435074
},
"msmarco": {
"abstain": 1.0,
"entailment": 75.97180091246138,
"neutral": 8.206214380456805,
"contradiction": 15.82198470708181
},
"dolly": {
"abstain": 0.0,
"entailment": 88.08577812944205,
"neutral": 5.670668220668221,
"contradiction": 6.243553649889723
},
"avg": {
"abstain": 2.333333333333333,
"entailment": 61.59319992673943,
"neutral": 15.635753302888414,
"contradiction": 22.771046770372156
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 2.0,
"entailment": 63.29308580785488,
"neutral": 16.000023964578205,
"contradiction": 20.70689022756692
},
"msmarco": {
"abstain": 27.0,
"entailment": 79.9425347088522,
"neutral": 6.387141524127824,
"contradiction": 13.67032376701998
},
"dolly": {
"abstain": 1.0,
"entailment": 91.66077749411082,
"neutral": 4.603251686585019,
"contradiction": 3.7359708193041525
},
"avg": {
"abstain": 10.000000000000002,
"entailment": 78.19609042530726,
"neutral": 9.222168876895973,
"contradiction": 12.58174069779677
}
},
"Claude 2": {
"nq": {
"abstain": 4.0,
"entailment": 38.60519355808089,
"neutral": 39.31375041007514,
"contradiction": 22.08105603184398
},
"msmarco": {
"abstain": 5.0,
"entailment": 79.55886939360134,
"neutral": 8.077428019810291,
"contradiction": 12.363702586588362
},
"dolly": {
"abstain": 2.0,
"entailment": 94.08325518019396,
"neutral": 2.512476185945574,
"contradiction": 3.4042686338604704
},
"avg": {
"abstain": 3.6666666666666665,
"entailment": 70.88010443469517,
"neutral": 16.56643033727286,
"contradiction": 12.553465228031978
}
},
"InstructGPT": {
"nq": {
"abstain": 3.0,
"entailment": 28.028555064637533,
"neutral": 27.709495544547092,
"contradiction": 44.26194939081537
},
"msmarco": {
"abstain": 10.0,
"entailment": 54.946923446923456,
"neutral": 17.627391127391125,
"contradiction": 27.425685425685426
},
"dolly": {
"abstain": 1.0,
"entailment": 81.99237911359123,
"neutral": 7.622007622007622,
"contradiction": 10.385613264401144
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 55.17915553754715,
"neutral": 17.58344413938819,
"contradiction": 27.23740032306466
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 27.0,
"entailment": 44.33789954337899,
"neutral": 15.570776255707763,
"contradiction": 40.09132420091324
},
"msmarco": {
"abstain": 24.0,
"entailment": 66.20383679787703,
"neutral": 11.801071305715269,
"contradiction": 21.995091896407686
},
"dolly": {
"abstain": 1.0,
"entailment": 81.55825095219035,
"neutral": 10.279242362575696,
"contradiction": 8.162506685233957
},
"avg": {
"abstain": 17.333333333333336,
"entailment": 65.89687543375872,
"neutral": 12.303197902403308,
"contradiction": 21.799926663837955
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 49.76190476190476,
"neutral": 12.612433862433864,
"contradiction": 37.62566137566137
},
"msmarco": {
"abstain": 23.0,
"entailment": 81.59493284493284,
"neutral": 6.024531024531025,
"contradiction": 12.380536130536129
},
"dolly": {
"abstain": 20.0,
"entailment": 91.03472222222223,
"neutral": 6.576388888888888,
"contradiction": 2.388888888888889
},
"avg": {
"abstain": 19.666666666666668,
"entailment": 73.63314359683655,
"neutral": 8.503918856615952,
"contradiction": 17.862937546547506
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 75.47857142857143,
"neutral": 8.716269841269842,
"contradiction": 15.80515873015873
},
"msmarco": {
"abstain": 10.0,
"entailment": 86.34002541142354,
"neutral": 4.581828515332352,
"contradiction": 9.078146073244111
},
"dolly": {
"abstain": 4.0,
"entailment": 93.8433196483468,
"neutral": 3.9120239663717924,
"contradiction": 2.2446563852813854
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 85.060902503939,
"neutral": 5.8026078716034535,
"contradiction": 9.136489624457537
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 47.704400991165684,
"neutral": 39.265920435773374,
"contradiction": 13.029678573060924
},
"msmarco": {
"abstain": 2.0,
"entailment": 80.59393261993657,
"neutral": 8.396740776595946,
"contradiction": 11.009326603467484
},
"dolly": {
"abstain": 2.0,
"entailment": 89.68750150981764,
"neutral": 6.792575888164124,
"contradiction": 3.519922602018235
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 72.49331298592054,
"neutral": 18.29440904298589,
"contradiction": 9.212277971093558
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 5.0,
"entailment": 23.576730287256602,
"neutral": 15.753550543024227,
"contradiction": 60.66971916971916
},
"msmarco": {
"abstain": 17.0,
"entailment": 70.72327404857525,
"neutral": 7.331803404092561,
"contradiction": 21.944922547332183
},
"dolly": {
"abstain": 4.0,
"entailment": 91.82043650793652,
"neutral": 4.0476190476190474,
"contradiction": 4.131944444444445
},
"avg": {
"abstain": 8.666666666666668,
"entailment": 61.7685511973833,
"neutral": 9.101089097439463,
"contradiction": 29.130359705177224
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 20.165469685466057,
"neutral": 42.98696428313368,
"contradiction": 36.84756603140028
},
"msmarco": {
"abstain": 4.0,
"entailment": 75.30955082765344,
"neutral": 10.283691584742575,
"contradiction": 14.406757587603975
},
"dolly": {
"abstain": 2.0,
"entailment": 87.78724526773746,
"neutral": 7.746635683910593,
"contradiction": 4.466119048351942
},
"avg": {
"abstain": 2.3333333333333335,
"entailment": 60.85067718277865,
"neutral": 20.485031239552075,
"contradiction": 18.664291577669278
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 26.07997401740508,
"neutral": 44.20038213505851,
"contradiction": 29.719643847536414
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 75.77361866479563,
"neutral": 9.280276018391467,
"contradiction": 14.946105316812911
},
"dolly": {
"abstain": 1.0,
"entailment": 86.54677660467827,
"neutral": 7.574860515363949,
"contradiction": 5.878362879957776
},
"avg": {
"abstain": 3.0,
"entailment": 62.53262834162282,
"neutral": 20.580119216846153,
"contradiction": 16.887252441531018
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 6.0,
"entailment": 33.07235038746468,
"neutral": 43.85382993047179,
"contradiction": 23.07381968206354
},
"msmarco": {
"abstain": 4.0,
"entailment": 80.47124980243186,
"neutral": 7.865185891001109,
"contradiction": 11.663564306567025
},
"dolly": {
"abstain": 0.0,
"entailment": 90.47362498612499,
"neutral": 5.87049062049062,
"contradiction": 3.6558843933843925
},
"avg": {
"abstain": 3.3333333333333335,
"entailment": 68.55656350368152,
"neutral": 18.842644555343156,
"contradiction": 12.600791940975334
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 1.0,
"entailment": 26.234458582943425,
"neutral": 31.157024793388427,
"contradiction": 42.608516623668145
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 76.17962142979619,
"neutral": 9.420955037653329,
"contradiction": 14.399423532550468
},
"dolly": {
"abstain": 0.0,
"entailment": 87.43171194130947,
"neutral": 7.781636028230454,
"contradiction": 4.78665203046008
},
"avg": {
"abstain": 2.666666666666667,
"entailment": 63.099614338402034,
"neutral": 16.228965328322804,
"contradiction": 20.67142033327516
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 17.0,
"entailment": 40.723308483646505,
"neutral": 28.522805470121586,
"contradiction": 30.753886046231898
},
"msmarco": {
"abstain": 3.0,
"entailment": 79.98144476863116,
"neutral": 8.58845965924939,
"contradiction": 11.430095572119445
},
"dolly": {
"abstain": 0.0,
"entailment": 92.04622697563875,
"neutral": 4.316796112384348,
"contradiction": 3.6369769119769115
},
"avg": {
"abstain": 6.666666666666667,
"entailment": 72.65306230094201,
"neutral": 12.971975186448992,
"contradiction": 14.374962512609017
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 2.0,
"entailment": 42.163575617117935,
"neutral": 36.72106317841535,
"contradiction": 21.115361204466705
},
"msmarco": {
"abstain": 10.0,
"entailment": 65.13759465932725,
"neutral": 18.44282241763677,
"contradiction": 16.419582923035982
},
"dolly": {
"abstain": 2.0,
"entailment": 82.61033199548214,
"neutral": 12.206766981099369,
"contradiction": 5.182901023418466
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 63.25254008872119,
"neutral": 22.5691656406285,
"contradiction": 14.1782942706503
}
},
"Phi-2": {
"nq": {
"abstain": 3.0,
"entailment": 20.55665158909549,
"neutral": 25.96141611908744,
"contradiction": 53.48193229181707
},
"msmarco": {
"abstain": 18.0,
"entailment": 67.97196908782274,
"neutral": 5.817229518449031,
"contradiction": 26.21080139372822
},
"dolly": {
"abstain": 3.0,
"entailment": 84.77256255240052,
"neutral": 9.687282540108704,
"contradiction": 5.5401549074907726
},
"avg": {
"abstain": 8.0,
"entailment": 57.21244651060354,
"neutral": 14.257016632082776,
"contradiction": 28.530536857313678
}
}
},
"claude2###nli": {
"Alpaca 7B": {
"nq": {
"abstain": 14.000000000000002,
"entailment": 43.753817431998,
"neutral": 38.88117354601622,
"contradiction": 17.36500902198577
},
"msmarco": {
"abstain": 2.0,
"entailment": 85.0101230968578,
"neutral": 5.3146258503401365,
"contradiction": 9.675251052802073
},
"dolly": {
"abstain": 8.0,
"entailment": 89.82052669552671,
"neutral": 3.7306292741075353,
"contradiction": 6.448844030365769
},
"avg": {
"abstain": 8.0,
"entailment": 73.75836528489981,
"neutral": 15.245768664886311,
"contradiction": 10.995866050213877
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 46.60804809225861,
"neutral": 33.622420123735914,
"contradiction": 19.76953178400547
},
"msmarco": {
"abstain": 1.0,
"entailment": 89.83985150651819,
"neutral": 5.972237638904306,
"contradiction": 4.187910854577521
},
"dolly": {
"abstain": 5.0,
"entailment": 89.98259998198893,
"neutral": 5.461597633261312,
"contradiction": 4.555802384749753
},
"avg": {
"abstain": 2.0,
"entailment": 75.18128267571466,
"neutral": 15.212058890424975,
"contradiction": 9.606658433860368
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 6.0,
"entailment": 40.78771579553808,
"neutral": 39.33744968006545,
"contradiction": 19.874834524396473
},
"msmarco": {
"abstain": 1.0,
"entailment": 94.4033772064075,
"neutral": 2.7214059789817364,
"contradiction": 2.875216814610754
},
"dolly": {
"abstain": 0.0,
"entailment": 91.79145299145299,
"neutral": 4.430769230769231,
"contradiction": 3.7777777777777777
},
"avg": {
"abstain": 2.333333333333333,
"entailment": 76.31100657802124,
"neutral": 15.051933054342209,
"contradiction": 8.637060367636558
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 2.0,
"entailment": 70.24307071970874,
"neutral": 18.845966958211854,
"contradiction": 10.9109623220794
},
"msmarco": {
"abstain": 27.0,
"entailment": 93.63130780939002,
"neutral": 3.2496194824961946,
"contradiction": 3.1190727081138037
},
"dolly": {
"abstain": 1.0,
"entailment": 94.37582479249146,
"neutral": 2.4633237133237134,
"contradiction": 3.160851494184828
},
"avg": {
"abstain": 10.000000000000002,
"entailment": 85.41523353730956,
"neutral": 8.622207524985303,
"contradiction": 5.962558937705136
}
},
"Claude 2": {
"nq": {
"abstain": 4.0,
"entailment": 49.358368744934836,
"neutral": 42.2805749530059,
"contradiction": 8.361056302059255
},
"msmarco": {
"abstain": 5.0,
"entailment": 90.45094618053719,
"neutral": 3.8998626479887686,
"contradiction": 5.649191171474046
},
"dolly": {
"abstain": 2.0,
"entailment": 93.38884471537531,
"neutral": 2.951453308596166,
"contradiction": 3.6597019760285066
},
"avg": {
"abstain": 3.6666666666666665,
"entailment": 77.79705906149329,
"neutral": 16.327559070207354,
"contradiction": 5.875381868299366
}
},
"InstructGPT": {
"nq": {
"abstain": 3.0,
"entailment": 40.96019101173741,
"neutral": 33.48901384983859,
"contradiction": 25.55079513842401
},
"msmarco": {
"abstain": 10.0,
"entailment": 88.10163910163911,
"neutral": 4.191290191290191,
"contradiction": 7.707070707070707
},
"dolly": {
"abstain": 1.0,
"entailment": 91.71777296777297,
"neutral": 4.379910213243546,
"contradiction": 3.9023168189834854
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 73.3648446541803,
"neutral": 14.193222278886616,
"contradiction": 12.441933066933068
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 27.0,
"entailment": 58.81278538812785,
"neutral": 12.100456621004565,
"contradiction": 29.08675799086758
},
"msmarco": {
"abstain": 24.0,
"entailment": 87.42769914983538,
"neutral": 4.267961570593148,
"contradiction": 8.304339279571478
},
"dolly": {
"abstain": 1.0,
"entailment": 87.37540056167506,
"neutral": 8.171889838556506,
"contradiction": 4.452709599768423
},
"avg": {
"abstain": 17.333333333333336,
"entailment": 78.9838835658333,
"neutral": 8.131917365788333,
"contradiction": 12.884199068378383
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 63.9616402116402,
"neutral": 27.003968253968253,
"contradiction": 9.034391534391533
},
"msmarco": {
"abstain": 23.0,
"entailment": 95.88383838383837,
"neutral": 0.2922077922077922,
"contradiction": 3.8239538239538233
},
"dolly": {
"abstain": 20.0,
"entailment": 95.04861111111111,
"neutral": 2.722222222222222,
"contradiction": 2.2291666666666665
},
"avg": {
"abstain": 19.666666666666668,
"entailment": 84.48017519594283,
"neutral": 10.409174734900875,
"contradiction": 5.110650069156293
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 75.94603174603174,
"neutral": 17.39563492063492,
"contradiction": 6.658333333333333
},
"msmarco": {
"abstain": 10.0,
"entailment": 96.2037037037037,
"neutral": 1.1111111111111112,
"contradiction": 2.685185185185185
},
"dolly": {
"abstain": 4.0,
"entailment": 93.81820436507935,
"neutral": 2.34375,
"contradiction": 3.8380456349206353
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 88.31987456987457,
"neutral": 7.218753468753468,
"contradiction": 4.461371961371961
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 57.852571121688754,
"neutral": 36.04099674834969,
"contradiction": 6.106432129961542
},
"msmarco": {
"abstain": 2.0,
"entailment": 93.16440249093311,
"neutral": 2.3344155844155843,
"contradiction": 4.501181924651313
},
"dolly": {
"abstain": 2.0,
"entailment": 90.97237329830365,
"neutral": 5.412739188249392,
"contradiction": 3.614887513446937
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 80.50899033619622,
"neutral": 14.740948792419378,
"contradiction": 4.750060871384401
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 5.0,
"entailment": 47.806631964526694,
"neutral": 36.29053402737613,
"contradiction": 15.902834008097166
},
"msmarco": {
"abstain": 17.0,
"entailment": 96.56961178045515,
"neutral": 0.6024096385542169,
"contradiction": 2.8279785809906293
},
"dolly": {
"abstain": 4.0,
"entailment": 94.10389957264957,
"neutral": 2.690972222222222,
"contradiction": 3.205128205128205
},
"avg": {
"abstain": 8.666666666666668,
"entailment": 78.79884004884005,
"neutral": 13.707788561803161,
"contradiction": 7.49337138935679
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 39.283824307910905,
"neutral": 43.16653367393302,
"contradiction": 17.549642018156074
},
"msmarco": {
"abstain": 4.0,
"entailment": 90.16009605803403,
"neutral": 5.1082535117861205,
"contradiction": 4.731650430179842
},
"dolly": {
"abstain": 2.0,
"entailment": 89.94203742102901,
"neutral": 6.747904872904874,
"contradiction": 3.310057706066109
},
"avg": {
"abstain": 2.3333333333333335,
"entailment": 72.89688564954024,
"neutral": 18.515951701008575,
"contradiction": 8.587162649451177
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 43.25385499356581,
"neutral": 41.48118537755674,
"contradiction": 15.264959628877445
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 89.29397828166928,
"neutral": 4.993593465579884,
"contradiction": 5.712428252750834
},
"dolly": {
"abstain": 1.0,
"entailment": 91.96779931628419,
"neutral": 4.387528857225827,
"contradiction": 3.644671826490008
},
"avg": {
"abstain": 3.0,
"entailment": 74.54049400986389,
"neutral": 17.20071100186393,
"contradiction": 8.258794988272184
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 6.0,
"entailment": 46.263535689767494,
"neutral": 44.118257573143616,
"contradiction": 9.618206737088892
},
"msmarco": {
"abstain": 4.0,
"entailment": 91.34825127318335,
"neutral": 3.8664840719867892,
"contradiction": 4.785264654829872
},
"dolly": {
"abstain": 0.0,
"entailment": 91.30055663290958,
"neutral": 4.725811606693959,
"contradiction": 3.9736317603964655
},
"avg": {
"abstain": 3.3333333333333335,
"entailment": 76.71813841501623,
"neutral": 17.209930494674577,
"contradiction": 6.071931090309208
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 1.0,
"entailment": 42.53210313816374,
"neutral": 35.6631976328946,
"contradiction": 21.80469922894165
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 91.60184036686249,
"neutral": 4.9421711771490395,
"contradiction": 3.455988455988456
},
"dolly": {
"abstain": 0.0,
"entailment": 92.34455560779091,
"neutral": 4.48907383466207,
"contradiction": 3.1663705575470282
},
"avg": {
"abstain": 2.666666666666667,
"entailment": 75.2195374163545,
"neutral": 15.202691330813812,
"contradiction": 9.577771252831688
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 17.0,
"entailment": 56.353780353795756,
"neutral": 31.15051824547094,
"contradiction": 12.495701400733292
},
"msmarco": {
"abstain": 3.0,
"entailment": 92.04375044963112,
"neutral": 3.905952859764579,
"contradiction": 4.050296690604317
},
"dolly": {
"abstain": 0.0,
"entailment": 93.20912698412698,
"neutral": 3.3912698412698417,
"contradiction": 3.3996031746031745
},
"avg": {
"abstain": 6.666666666666667,
"entailment": 81.8804295049713,
"neutral": 11.798205092493703,
"contradiction": 6.321365402534999
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 2.0,
"entailment": 52.58517727956774,
"neutral": 38.94160858250274,
"contradiction": 8.47321413792953
},
"msmarco": {
"abstain": 10.0,
"entailment": 79.42988099007772,
"neutral": 11.111471761758365,
"contradiction": 9.458647248163933
},
"dolly": {
"abstain": 2.0,
"entailment": 85.78617082162448,
"neutral": 9.282303892866539,
"contradiction": 4.931525285508984
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 72.40937553504835,
"neutral": 20.020894689316233,
"contradiction": 7.5697297756354125
}
},
"Phi-2": {
"nq": {
"abstain": 3.0,
"entailment": 42.40528607569238,
"neutral": 31.310681098431242,
"contradiction": 26.28403282587637
},
"msmarco": {
"abstain": 18.0,
"entailment": 89.76480836236934,
"neutral": 2.6132404181184667,
"contradiction": 7.621951219512195
},
"dolly": {
"abstain": 3.0,
"entailment": 89.4985599319208,
"neutral": 6.62513696952019,
"contradiction": 3.8763030985590112
},
"avg": {
"abstain": 8.0,
"entailment": 73.02676575526364,
"neutral": 14.108913285786246,
"contradiction": 12.864320958950112
}
}
},
"gpt4###ensemble": {
"Alpaca 7B": {
"nq": {
"abstain": 6.0,
"entailment": 17.630373468139425,
"neutral": 52.47881967844421,
"contradiction": 29.89080685341637
},
"msmarco": {
"abstain": 0.0,
"entailment": 65.9745670995671,
"neutral": 12.682142857142855,
"contradiction": 21.343290043290043
},
"dolly": {
"abstain": 6.0,
"entailment": 85.27159274499698,
"neutral": 6.905854113300921,
"contradiction": 7.822553141702078
},
"avg": {
"abstain": 4.0,
"entailment": 56.493894215248375,
"neutral": 23.78601952131364,
"contradiction": 19.72008626343798
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 29.46922244422244,
"neutral": 39.1429057054057,
"contradiction": 31.387871850371845
},
"msmarco": {
"abstain": 0.0,
"entailment": 84.6811115355233,
"neutral": 10.89693362193362,
"contradiction": 4.421954842543078
},
"dolly": {
"abstain": 4.0,
"entailment": 89.4821338961964,
"neutral": 7.672657203907204,
"contradiction": 2.8452088998964
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 67.58553463516698,
"neutral": 19.393780487530485,
"contradiction": 13.020684877302525
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 1.0,
"entailment": 17.956616896010832,
"neutral": 44.23574296568949,
"contradiction": 37.80764013829968
},
"msmarco": {
"abstain": 0.0,
"entailment": 88.15196886446887,
"neutral": 6.351526251526252,
"contradiction": 5.4965048840048825
},
"dolly": {
"abstain": 0.0,
"entailment": 90.75256849623074,
"neutral": 2.8773445629849803,
"contradiction": 6.370086940784285
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 65.77979534707369,
"neutral": 17.73319610386081,
"contradiction": 16.4870085490655
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 61.85544733044733,
"neutral": 18.538924963924963,
"contradiction": 19.605627705627704
},
"msmarco": {
"abstain": 0.0,
"entailment": 75.2887955182073,
"neutral": 6.535014005602241,
"contradiction": 18.176190476190474
},
"dolly": {
"abstain": 0.0,
"entailment": 95.43484848484849,
"neutral": 2.5651515151515154,
"contradiction": 2.0
},
"avg": {
"abstain": 0.0,
"entailment": 77.52636377783436,
"neutral": 9.213030161559573,
"contradiction": 13.260606060606062
}
},
"Claude 2": {
"nq": {
"abstain": 0.0,
"entailment": 40.02383455378944,
"neutral": 48.03262041358283,
"contradiction": 11.943545032627718
},
"msmarco": {
"abstain": 0.0,
"entailment": 88.78287717184774,
"neutral": 8.112981136510548,
"contradiction": 3.1041416916416917
},
"dolly": {
"abstain": 0.0,
"entailment": 95.06695997239476,
"neutral": 3.759555179120397,
"contradiction": 1.1734848484848486
},
"avg": {
"abstain": 0.0,
"entailment": 74.62455723267732,
"neutral": 19.96838557640459,
"contradiction": 5.407057190918085
}
},
"InstructGPT": {
"nq": {
"abstain": 0.0,
"entailment": 20.613888888888887,
"neutral": 24.323232323232325,
"contradiction": 55.06287878787878
},
"msmarco": {
"abstain": 0.0,
"entailment": 70.83974358974359,
"neutral": 12.616666666666667,
"contradiction": 16.54358974358974
},
"dolly": {
"abstain": 0.0,
"entailment": 85.98766511266511,
"neutral": 4.345454545454545,
"contradiction": 9.666880341880344
},
"avg": {
"abstain": 0.0,
"entailment": 59.147099197099195,
"neutral": 13.76178451178451,
"contradiction": 27.09111629111629
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 33.08333333333333,
"neutral": 25.683333333333337,
"contradiction": 41.233333333333334
},
"msmarco": {
"abstain": 0.0,
"entailment": 69.2890873015873,
"neutral": 12.878373015873015,
"contradiction": 17.832539682539682
},
"dolly": {
"abstain": 1.0,
"entailment": 84.19342161766404,
"neutral": 10.701525132121418,
"contradiction": 5.1050532502145405
},
"avg": {
"abstain": 0.33333333333333337,
"entailment": 62.115019410169914,
"neutral": 16.440206096992156,
"contradiction": 21.444774492837933
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 46.48526077097506,
"neutral": 10.870181405895691,
"contradiction": 42.64455782312925
},
"msmarco": {
"abstain": 5.0,
"entailment": 85.34252297410193,
"neutral": 4.156223893065999,
"contradiction": 10.501253132832082
},
"dolly": {
"abstain": 21.0,
"entailment": 91.22362869198312,
"neutral": 5.822784810126583,
"contradiction": 2.953586497890295
},
"avg": {
"abstain": 14.000000000000002,
"entailment": 74.49212501538082,
"neutral": 6.852467085025225,
"contradiction": 18.655407899593946
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 0.0,
"entailment": 42.3192079188815,
"neutral": 49.54031618891121,
"contradiction": 8.14047589220728
},
"msmarco": {
"abstain": 0.0,
"entailment": 72.08984085831608,
"neutral": 22.679601191721932,
"contradiction": 5.230557949961974
},
"dolly": {
"abstain": 0.0,
"entailment": 85.03484607896542,
"neutral": 11.418896529188963,
"contradiction": 3.546257391845627
},
"avg": {
"abstain": 0.0,
"entailment": 66.48129828538767,
"neutral": 27.879604636607375,
"contradiction": 5.63909707800496
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 76.50367965367965,
"neutral": 10.33073593073593,
"contradiction": 13.165584415584416
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.81666666666666,
"neutral": 6.011904761904762,
"contradiction": 3.1714285714285717
},
"dolly": {
"abstain": 0.0,
"entailment": 94.35,
"neutral": 1.874242424242424,
"contradiction": 3.7757575757575754
},
"avg": {
"abstain": 0.0,
"entailment": 87.22344877344878,
"neutral": 6.072294372294372,
"contradiction": 6.704256854256855
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 47.53210942936007,
"neutral": 46.04064378361053,
"contradiction": 6.427246787029395
},
"msmarco": {
"abstain": 0.0,
"entailment": 91.68919342611292,
"neutral": 3.9656392412197365,
"contradiction": 4.345167332667333
},
"dolly": {
"abstain": 0.0,
"entailment": 93.33216301672185,
"neutral": 4.861454938934998,
"contradiction": 1.8063820443431613
},
"avg": {
"abstain": 0.0,
"entailment": 77.5178219573983,
"neutral": 18.28924598792176,
"contradiction": 4.1929320546799635
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 17.36546562304138,
"neutral": 22.48258195227892,
"contradiction": 60.1519524246797
},
"msmarco": {
"abstain": 0.0,
"entailment": 71.97857142857143,
"neutral": 9.125,
"contradiction": 18.896428571428572
},
"dolly": {
"abstain": 1.0,
"entailment": 95.89466089466089,
"neutral": 0.7215007215007215,
"contradiction": 3.383838383838384
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 61.78056935607271,
"neutral": 10.770819411759009,
"contradiction": 27.448611232168275
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 17.863238299654707,
"neutral": 57.486501904329,
"contradiction": 24.65025979601629
},
"msmarco": {
"abstain": 0.0,
"entailment": 84.42618947959507,
"neutral": 10.243142272244437,
"contradiction": 5.330668248160508
},
"dolly": {
"abstain": 0.0,
"entailment": 91.5319832944833,
"neutral": 5.90772422022422,
"contradiction": 2.5602924852924853
},
"avg": {
"abstain": 0.0,
"entailment": 64.60713702457768,
"neutral": 24.545789465599224,
"contradiction": 10.847073509823096
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 27.52490646608294,
"neutral": 52.27585404791286,
"contradiction": 20.199239486004195
},
"msmarco": {
"abstain": 0.0,
"entailment": 84.40955523234933,
"neutral": 10.119819151436797,
"contradiction": 5.470625616213851
},
"dolly": {
"abstain": 0.0,
"entailment": 90.67812160062158,
"neutral": 6.539147241647241,
"contradiction": 2.782731157731158
},
"avg": {
"abstain": 0.0,
"entailment": 67.53752776635127,
"neutral": 22.978273480332305,
"contradiction": 9.484198753316402
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 29.7032482610269,
"neutral": 56.06171349720112,
"contradiction": 14.235038241771985
},
"msmarco": {
"abstain": 0.0,
"entailment": 86.5586607836608,
"neutral": 7.181184093684094,
"contradiction": 6.260155122655123
},
"dolly": {
"abstain": 0.0,
"entailment": 94.01765873015874,
"neutral": 3.076388888888889,
"contradiction": 2.9059523809523813
},
"avg": {
"abstain": 0.0,
"entailment": 70.09318925828215,
"neutral": 22.106428826591365,
"contradiction": 7.800381915126496
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 0.0,
"entailment": 24.200396825396826,
"neutral": 38.8416305916306,
"contradiction": 36.95797258297258
},
"msmarco": {
"abstain": 0.0,
"entailment": 86.83447945816367,
"neutral": 5.975154475154475,
"contradiction": 7.1903660666818565
},
"dolly": {
"abstain": 0.0,
"entailment": 92.95665559930266,
"neutral": 3.7103216263702192,
"contradiction": 3.333022774327122
},
"avg": {
"abstain": 0.0,
"entailment": 67.99717729428772,
"neutral": 16.175702231051762,
"contradiction": 15.827120474660521
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 9.0,
"entailment": 37.421324097148265,
"neutral": 39.20295211504003,
"contradiction": 23.375723787811705
},
"msmarco": {
"abstain": 0.0,
"entailment": 90.7903847184807,
"neutral": 7.111563333467358,
"contradiction": 2.098051948051948
},
"dolly": {
"abstain": 0.0,
"entailment": 95.01819112260289,
"neutral": 2.427474323062558,
"contradiction": 2.554334554334554
},
"avg": {
"abstain": 3.0,
"entailment": 75.55394528161116,
"neutral": 15.53736222722211,
"contradiction": 8.908692491166718
}
},
"Phi-2": {
"nq": {
"abstain": 0.0,
"entailment": 15.346622495151907,
"neutral": 30.171047906342025,
"contradiction": 54.48232959850608
},
"msmarco": {
"abstain": 1.0,
"entailment": 70.03491400043124,
"neutral": 5.658528359677784,
"contradiction": 24.306557639890972
},
"dolly": {
"abstain": 1.0,
"entailment": 87.42571822117277,
"neutral": 6.796859145343993,
"contradiction": 5.777422633483241
},
"avg": {
"abstain": 0.6666666666666667,
"entailment": 57.46062026662412,
"neutral": 14.262376354467648,
"contradiction": 28.27700337890824
}
}
},
"claude2###ensemble": {
"Alpaca 7B": {
"nq": {
"abstain": 14.000000000000002,
"entailment": 21.241871253841158,
"neutral": 49.81864284446363,
"contradiction": 28.9394859016952
},
"msmarco": {
"abstain": 2.0,
"entailment": 64.45051830255912,
"neutral": 14.173550372529963,
"contradiction": 21.375931324910912
},
"dolly": {
"abstain": 8.0,
"entailment": 84.25871682665161,
"neutral": 7.055512422360248,
"contradiction": 8.685770750988143
},
"avg": {
"abstain": 8.0,
"entailment": 57.5896872084532,
"neutral": 22.90767523184403,
"contradiction": 19.50263755970278
}
},
"Baichuan 2 13B Chat": {
"nq": {
"abstain": 0.0,
"entailment": 28.73793238924818,
"neutral": 40.935400856453484,
"contradiction": 30.32666675429833
},
"msmarco": {
"abstain": 1.0,
"entailment": 83.65797782464449,
"neutral": 9.18279251612585,
"contradiction": 7.15922965922966
},
"dolly": {
"abstain": 5.0,
"entailment": 89.22320636472502,
"neutral": 7.84657984321908,
"contradiction": 2.9302137920558975
},
"avg": {
"abstain": 2.0,
"entailment": 66.77597839528401,
"neutral": 19.551230033495308,
"contradiction": 13.67279157122068
}
},
"ChatGLM 3 6B": {
"nq": {
"abstain": 6.0,
"entailment": 17.90577755925065,
"neutral": 48.04422058958979,
"contradiction": 34.05000185115955
},
"msmarco": {
"abstain": 1.0,
"entailment": 86.7038283944185,
"neutral": 6.39873594419049,
"contradiction": 6.897435661391005
},
"dolly": {
"abstain": 0.0,
"entailment": 92.25838656675637,
"neutral": 3.3035958924116824,
"contradiction": 4.438017540831942
},
"avg": {
"abstain": 2.333333333333333,
"entailment": 66.527852417381,
"neutral": 18.70304158067395,
"contradiction": 14.769106001945056
}
},
"GPT-3.5-Turbo": {
"nq": {
"abstain": 2.0,
"entailment": 63.65169336571055,
"neutral": 19.157318692764452,
"contradiction": 17.190987941524995
},
"msmarco": {
"abstain": 27.0,
"entailment": 91.13882059087538,
"neutral": 3.5897435897435894,
"contradiction": 5.271435819381025
},
"dolly": {
"abstain": 1.0,
"entailment": 96.6845037678371,
"neutral": 1.236772486772487,
"contradiction": 2.0787237453904117
},
"avg": {
"abstain": 10.000000000000002,
"entailment": 83.19542861477558,
"neutral": 8.37744070419509,
"contradiction": 8.427130681029317
}
},
"Claude 2": {
"nq": {
"abstain": 4.0,
"entailment": 37.258990243046604,
"neutral": 51.60815620775462,
"contradiction": 11.132853549198769
},
"msmarco": {
"abstain": 5.0,
"entailment": 88.64898350522435,
"neutral": 7.009393486132937,
"contradiction": 4.341623008642725
},
"dolly": {
"abstain": 2.0,
"entailment": 96.91521913460689,
"neutral": 1.7088127292208926,
"contradiction": 1.3759681361722178
},
"avg": {
"abstain": 3.6666666666666665,
"entailment": 74.38134246200782,
"neutral": 20.0267786318018,
"contradiction": 5.591878906190374
}
},
"InstructGPT": {
"nq": {
"abstain": 3.0,
"entailment": 26.36761577483227,
"neutral": 29.128025468231655,
"contradiction": 44.50435875693607
},
"msmarco": {
"abstain": 10.0,
"entailment": 71.43980093980093,
"neutral": 11.610704110704113,
"contradiction": 16.94949494949495
},
"dolly": {
"abstain": 1.0,
"entailment": 87.37298195631529,
"neutral": 4.285914702581369,
"contradiction": 8.341103341103342
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 61.66841268676932,
"neutral": 15.016389496284601,
"contradiction": 23.315197816946068
}
},
"Falcon 40B Instruct": {
"nq": {
"abstain": 27.0,
"entailment": 44.65753424657534,
"neutral": 10.627853881278538,
"contradiction": 44.714611872146115
},
"msmarco": {
"abstain": 24.0,
"entailment": 71.2170131210379,
"neutral": 11.690255049388174,
"contradiction": 17.092731829573932
},
"dolly": {
"abstain": 1.0,
"entailment": 84.1018395185062,
"neutral": 10.461837545170878,
"contradiction": 5.436322936322936
},
"avg": {
"abstain": 17.333333333333336,
"entailment": 68.54264157068948,
"neutral": 10.887155782494967,
"contradiction": 20.570202646815545
}
},
"Gemini Pro (API)\u2020": {
"nq": {
"abstain": 16.0,
"entailment": 48.902116402116405,
"neutral": 12.36111111111111,
"contradiction": 38.736772486772495
},
"msmarco": {
"abstain": 23.0,
"entailment": 90.06715506715507,
"neutral": 2.762237762237762,
"contradiction": 7.1706071706071715
},
"dolly": {
"abstain": 20.0,
"entailment": 94.74305555555557,
"neutral": 4.381944444444445,
"contradiction": 0.8749999999999999
},
"avg": {
"abstain": 19.666666666666668,
"entailment": 77.2713409227932,
"neutral": 6.645565131042308,
"contradiction": 16.083093946164485
}
},
"Gemini Pro (Bard)*": {
"nq": {
"abstain": 2.0,
"entailment": 40.9417668484882,
"neutral": 49.41088946712112,
"contradiction": 9.647343684390677
},
"msmarco": {
"abstain": 10.0,
"entailment": 73.31915173130437,
"neutral": 19.394666167025058,
"contradiction": 7.286182101670577
},
"dolly": {
"abstain": 2.0,
"entailment": 84.99209899446048,
"neutral": 12.136500863731229,
"contradiction": 2.871400141808305
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 66.22462415533693,
"neutral": 27.192881844250998,
"contradiction": 6.58249400041207
}
},
"GPT-4": {
"nq": {
"abstain": 0.0,
"entailment": 77.38928571428572,
"neutral": 13.027380952380952,
"contradiction": 9.583333333333332
},
"msmarco": {
"abstain": 10.0,
"entailment": 97.28601137424667,
"neutral": 1.8152958152958154,
"contradiction": 0.8986928104575163
},
"dolly": {
"abstain": 4.0,
"entailment": 97.83887987012987,
"neutral": 1.3541666666666667,
"contradiction": 0.8069534632034631
},
"avg": {
"abstain": 4.666666666666667,
"entailment": 90.51469252672462,
"neutral": 5.580820694457058,
"contradiction": 3.90448677881833
}
},
"GPT-4-Turbo": {
"nq": {
"abstain": 0.0,
"entailment": 45.17507655742949,
"neutral": 49.37076461120579,
"contradiction": 5.454158831364714
},
"msmarco": {
"abstain": 2.0,
"entailment": 92.14598100312386,
"neutral": 4.956777349634493,
"contradiction": 2.8972416472416467
},
"dolly": {
"abstain": 2.0,
"entailment": 94.14953896976706,
"neutral": 4.708575704674144,
"contradiction": 1.141885325558795
},
"avg": {
"abstain": 1.3333333333333335,
"entailment": 76.94077234150764,
"neutral": 19.879327906901434,
"contradiction": 3.1798997515909284
}
},
"InternLM 20B Chat": {
"nq": {
"abstain": 5.0,
"entailment": 23.029946661525607,
"neutral": 21.126309363151467,
"contradiction": 55.843743975322916
},
"msmarco": {
"abstain": 17.0,
"entailment": 77.90208452859055,
"neutral": 4.962707974756167,
"contradiction": 17.13520749665328
},
"dolly": {
"abstain": 4.0,
"entailment": 95.14136904761905,
"neutral": 2.775297619047619,
"contradiction": 2.083333333333333
},
"avg": {
"abstain": 8.666666666666668,
"entailment": 64.91711451565466,
"neutral": 9.800484389900449,
"contradiction": 25.28240109444489
}
},
"LLaMA 2 7B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 17.622607560326042,
"neutral": 57.55738452779022,
"contradiction": 24.82000791188373
},
"msmarco": {
"abstain": 4.0,
"entailment": 84.32195584339527,
"neutral": 9.302483087949039,
"contradiction": 6.375561068655698
},
"dolly": {
"abstain": 2.0,
"entailment": 89.80463647330396,
"neutral": 6.685943108512136,
"contradiction": 3.5094204181839226
},
"avg": {
"abstain": 2.3333333333333335,
"entailment": 63.619113596662146,
"neutral": 24.731883513066656,
"contradiction": 11.649002890271198
}
},
"LLaMA 2 13B Chat": {
"nq": {
"abstain": 1.0,
"entailment": 24.459323997575627,
"neutral": 57.05990065271198,
"contradiction": 18.4807753497124
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 83.78834403664625,
"neutral": 11.065448181322541,
"contradiction": 5.146207782031211
},
"dolly": {
"abstain": 1.0,
"entailment": 91.25159415933285,
"neutral": 5.354805525216732,
"contradiction": 3.393600315450395
},
"avg": {
"abstain": 3.0,
"entailment": 66.14328829189705,
"neutral": 24.770249458687072,
"contradiction": 9.086462249415874
}
},
"LLaMA 2 70B Chat": {
"nq": {
"abstain": 6.0,
"entailment": 29.703146161087002,
"neutral": 58.07392593591626,
"contradiction": 12.22292790299674
},
"msmarco": {
"abstain": 4.0,
"entailment": 87.83553858078314,
"neutral": 9.12421831137864,
"contradiction": 3.040243107838216
},
"dolly": {
"abstain": 0.0,
"entailment": 94.5539751914752,
"neutral": 2.815133477633478,
"contradiction": 2.630891330891331
},
"avg": {
"abstain": 3.3333333333333335,
"entailment": 71.30932745532718,
"neutral": 22.815128771144224,
"contradiction": 5.875543773528604
}
},
"Mistral 7B Instruct": {
"nq": {
"abstain": 1.0,
"entailment": 25.40773900622385,
"neutral": 38.96468290407684,
"contradiction": 35.6275780896993
},
"msmarco": {
"abstain": 7.000000000000001,
"entailment": 85.77285591279899,
"neutral": 7.781711781553653,
"contradiction": 6.44543230564736
},
"dolly": {
"abstain": 0.0,
"entailment": 91.99693350165488,
"neutral": 5.269511267885881,
"contradiction": 2.7335552304592547
},
"avg": {
"abstain": 2.666666666666667,
"entailment": 67.43813394408204,
"neutral": 17.49367784238591,
"contradiction": 15.06818821353206
}
},
"ERNIE Bot 4.0 (\u6587\u5fc3\u4e00\u8a004.0)": {
"nq": {
"abstain": 17.0,
"entailment": 40.3128901692979,
"neutral": 36.4439475673414,
"contradiction": 23.243162263360706
},
"msmarco": {
"abstain": 3.0,
"entailment": 89.3060707172859,
"neutral": 8.29678943576417,
"contradiction": 2.397139846949939
},
"dolly": {
"abstain": 0.0,
"entailment": 95.77377344877344,
"neutral": 2.5912698412698414,
"contradiction": 1.63495670995671
},
"avg": {
"abstain": 6.666666666666667,
"entailment": 77.09298603037786,
"neutral": 14.602725741019448,
"contradiction": 8.30428822860269
}
},
"Phi-2": {
"nq": {
"abstain": 3.0,
"entailment": 20.20246867881798,
"neutral": 32.207070269532366,
"contradiction": 47.59046105164965
},
"msmarco": {
"abstain": 18.0,
"entailment": 76.65605735727686,
"neutral": 5.782967032967033,
"contradiction": 17.5609756097561
},
"dolly": {
"abstain": 3.0,
"entailment": 88.40307097807987,
"neutral": 7.849456563107469,
"contradiction": 3.7474724588126644
},
"avg": {
"abstain": 8.0,
"entailment": 60.94396394933259,
"neutral": 15.7959652154687,
"contradiction": 23.260070835198707
}
}
}
}