chatbot-arena-leaderboard / arena_hard_leaderboard_20240715.csv
connorchenn's picture
Upload arena_hard_leaderboard_20240715.csv
a63ab21 verified
raw
history blame
3.21 kB
model,score,rating_q025,rating_q975,CI,avg_tokens
gpt-4-turbo-2024-04-09,82.63,80.46,84.54,"(-2.17, +1.91)",662.0
claude-3-5-sonnet-20240620,79.35,77.39,81.06,"(-1.96, +1.71)",567.0
gpt-4o-2024-05-13,79.21,77.49,80.94,"(-1.72, +1.73)",696.0
gpt-4-0125-preview,77.96,76.14,79.9,"(-1.82, +1.94)",619.0
gemini-1.5-pro-api-0514,71.95,69.38,74.07,"(-2.57, +2.12)",676.0
yi-large-preview,71.47,69.28,73.64,"(-2.19, +2.17)",720.0
glm-4-0520,63.84,61.54,66.2,"(-2.30, +2.36)",636.0
yi-large,63.7,60.82,66.48,"(-2.88, +2.78)",626.0
deepseek-coder-v2,62.3,60.09,64.7,"(-2.21, +2.40)",578.0
claude-3-opus-20240229,60.35,57.24,62.66,"(-3.11, +2.31)",541.0
gemma-2-27b-it,57.51,55.1,59.64,"(-2.41, +2.13)",577.0
glm-4-0116,55.72,54.04,58.39,"(-1.68, +2.67)",622.0
gemini-1.5-pro-api-0409-preview,53.37,51.12,56.17,"(-2.25, +2.80)",478.0
glm-4-air,50.88,48.09,52.93,"(-2.79, +2.05)",619.0
gpt-4-0314,50.0,50.0,50.0,"(-0.00, +0.00)",423.0
gemini-1.5-flash-api-0514,49.61,47.12,52.24,"(-2.49, +2.63)",642.0
qwen2-72b-instruct,46.86,44.73,48.7,"(-2.13, +1.84)",515.0
claude-3-sonnet-20240229,46.8,43.82,49.13,"(-2.98, +2.33)",552.0
llama-3-70b-instruct,46.57,44.22,49.35,"(-2.35, +2.78)",591.0
claude-3-haiku-20240307,41.47,38.95,44.09,"(-2.52, +2.62)",505.0
gpt-4-0613,37.9,35.45,40.33,"(-2.45, +2.43)",354.0
mistral-large-2402,37.71,35.42,40.14,"(-2.29, +2.43)",400.0
mixtral-8x22b-instruct-v0.1,36.36,34.3,39.26,"(-2.06, +2.90)",430.0
qwen1.5-72b-chat,36.12,34.08,38.27,"(-2.04, +2.15)",474.0
phi-3-medium-4k-instruct,33.37,31.29,35.97,"(-2.08, +2.60)",517.0
command-r-plus,33.07,30.8,35.29,"(-2.27, +2.22)",541.0
mistral-medium,31.9,29.44,33.91,"(-2.46, +2.01)",485.0
phi-3-small-8k-instruct,29.77,27.47,32.1,"(-2.30, +2.33)",568.0
mistral-next,27.37,25.34,29.64,"(-2.03, +2.27)",297.0
gpt-3.5-turbo-0613,24.82,23.0,26.99,"(-1.82, +2.17)",401.0
dbrx-instruct-preview,24.63,22.94,26.63,"(-1.69, +2.00)",415.0
claude-2.0,23.99,22.16,25.78,"(-1.83, +1.79)",295.0
mixtral-8x7b-instruct-v0.1,23.4,21.55,25.4,"(-1.85, +2.00)",457.0
gpt-3.5-turbo-0125,23.34,21.31,24.86,"(-2.03, +1.52)",329.0
yi-34b-chat,23.15,21.26,25.17,"(-1.89, +2.02)",611.0
starling-lm-7b-beta,23.02,20.7,24.97,"(-2.32, +1.95)",530.0
claude-2.1,22.77,20.93,24.33,"(-1.84, +1.56)",290.0
snorkel-mistral-pairrm-dpo,20.73,18.75,22.79,"(-1.98, +2.06)",564.0
llama-3-8b-instruct,20.56,18.67,22.27,"(-1.89, +1.71)",585.0
gpt-3.5-turbo-1106,18.87,17.16,20.4,"(-1.71, +1.53)",285.0
gpt-3.5-turbo-0314,18.05,16.21,20.07,"(-1.84, +2.02)",334.0
gemini-pro,17.8,16.11,19.61,"(-1.69, +1.81)",322.0
snowflake-arctic-instruct,17.61,15.8,19.38,"(-1.81, +1.77)",365.0
command-r,17.02,15.63,18.76,"(-1.39, +1.74)",432.0
phi-3-mini-128k-instruct,15.42,13.94,17.05,"(-1.48, +1.63)",609.0
tulu-2-dpo-70b,14.99,13.37,16.93,"(-1.62, +1.94)",550.0
starling-lm-7b-alpha,12.8,11.45,14.33,"(-1.35, +1.53)",483.0
mistral-7b-instruct,12.58,11.08,13.64,"(-1.50, +1.06)",541.0
gemma-1.1-7b-it,12.09,10.7,13.16,"(-1.39, +1.07)",341.0
llama-2-70b-chat,11.55,10.27,12.72,"(-1.28, +1.17)",595.0
vicuna-33b,8.63,7.57,9.76,"(-1.06, +1.13)",451.0
gemma-7b-it,7.47,6.29,8.64,"(-1.18, +1.17)",378.0
gemma-1.1-2b-it,3.36,2.78,4.17,"(-0.58, +0.81)",316.0
gemma-2b-it,3.0,2.28,3.67,"(-0.72, +0.67)",369.0