[ { "Model": "claude-3-5-sonnet-20240620", "Mode": "greedy", "Puzzle Acc": "33.40", "Cell Acc": "54.34", "No answer": "0.00", "Easy Puzzle Acc": "87.50", "Hard Puzzle Acc": "12.36", "Total Puzzles": 1000, "Reason Lens": "1141.94" }, { "Model": "claude-3-5-sonnet-20240620", "Mode": "sampling", "Puzzle Acc": "33.40", "Cell Acc": "53.01", "No answer": "0.10", "Easy Puzzle Acc": "88.21", "Hard Puzzle Acc": "12.08", "Total Puzzles": 1000, "Reason Lens": "1153.83" }, { "Model": "gpt-4o-2024-05-13", "Mode": "sampling", "Puzzle Acc": "30.80", "Cell Acc": "46.19", "No answer": "6.60", "Easy Puzzle Acc": "81.07", "Hard Puzzle Acc": "11.25", "Total Puzzles": 1000, "Reason Lens": "1549.74" }, { "Model": "gpt-4-turbo-2024-04-09", "Mode": "greedy", "Puzzle Acc": "28.40", "Cell Acc": "47.90", "No answer": "0.10", "Easy Puzzle Acc": "80.71", "Hard Puzzle Acc": "8.06", "Total Puzzles": 1000, "Reason Lens": "1148.46" }, { "Model": "gpt-4o-2024-05-13", "Mode": "greedy", "Puzzle Acc": "28.20", "Cell Acc": "38.72", "No answer": "19.30", "Easy Puzzle Acc": "77.86", "Hard Puzzle Acc": "8.89", "Total Puzzles": 1000, "Reason Lens": "1643.51" }, { "Model": "gpt-4-0314", "Mode": "greedy", "Puzzle Acc": "27.10", "Cell Acc": "47.43", "No answer": "0.20", "Easy Puzzle Acc": "77.14", "Hard Puzzle Acc": "7.64", "Total Puzzles": 1000, "Reason Lens": "1203.17" }, { "Model": "claude-3-opus-20240229", "Mode": "greedy", "Puzzle Acc": "27.00", "Cell Acc": "48.91", "No answer": "0.00", "Easy Puzzle Acc": "78.21", "Hard Puzzle Acc": "7.08", "Total Puzzles": 1000, "Reason Lens": "855.72" }, { "Model": "gpt-4-turbo-2024-04-09", "Mode": "sampling", "Puzzle Acc": "26.40", "Cell Acc": "47.93", "No answer": "0.00", "Easy Puzzle Acc": "74.29", "Hard Puzzle Acc": "7.78", "Total Puzzles": 1000, "Reason Lens": "1165.90" }, { "Model": "deepseek-chat", "Mode": "greedy", "Puzzle Acc": "22.70", "Cell Acc": "42.46", "No answer": "5.20", "Easy Puzzle Acc": "68.57", "Hard Puzzle Acc": "4.86", "Total Puzzles": 1000, "Reason Lens": "1260.23" }, { "Model": "Qwen2-72B-Instruct", "Mode": "greedy", "Puzzle Acc": "21.40", "Cell Acc": "38.32", "No answer": "10.20", "Easy Puzzle Acc": "63.93", "Hard Puzzle Acc": "4.86", "Total Puzzles": 1000, "Reason Lens": "1813.82" }, { "Model": "deepseek-coder", "Mode": "greedy", "Puzzle Acc": "21.10", "Cell Acc": "41.58", "No answer": "4.90", "Easy Puzzle Acc": "64.64", "Hard Puzzle Acc": "4.17", "Total Puzzles": 1000, "Reason Lens": "1324.55" }, { "Model": "gemini-1.5-pro", "Mode": "sampling", "Puzzle Acc": "19.70", "Cell Acc": "45.24", "No answer": "0.40", "Easy Puzzle Acc": "60.00", "Hard Puzzle Acc": "4.03", "Total Puzzles": 1000, "Reason Lens": "1356.77" }, { "Model": "gemini-1.5-flash", "Mode": "greedy", "Puzzle Acc": "19.40", "Cell Acc": "31.77", "No answer": "22.70", "Easy Puzzle Acc": "59.29", "Hard Puzzle Acc": "3.89", "Total Puzzles": 1000, "Reason Lens": "1538.18" }, { "Model": "gemini-1.5-pro", "Mode": "greedy", "Puzzle Acc": "19.40", "Cell Acc": "44.59", "No answer": "0.80", "Easy Puzzle Acc": "55.71", "Hard Puzzle Acc": "5.28", "Total Puzzles": 1000, "Reason Lens": "1336.17" }, { "Model": "yi-large-preview", "Mode": "greedy", "Puzzle Acc": "18.90", "Cell Acc": "42.61", "No answer": "1.40", "Easy Puzzle Acc": "58.93", "Hard Puzzle Acc": "3.33", "Total Puzzles": 1000, "Reason Lens": "833.36" }, { "Model": "yi-large", "Mode": "greedy", "Puzzle Acc": "18.80", "Cell Acc": "39.83", "No answer": "1.80", "Easy Puzzle Acc": "58.21", "Hard Puzzle Acc": "3.47", "Total Puzzles": 1000, "Reason Lens": "757.01" }, { "Model": "claude-3-sonnet-20240229", "Mode": "greedy", "Puzzle Acc": "18.70", "Cell Acc": "43.66", "No answer": "0.00", "Easy Puzzle Acc": "58.93", "Hard Puzzle Acc": "3.06", "Total Puzzles": 1000, "Reason Lens": "1095.37" }, { "Model": "Qwen2-72B-Instruct", "Mode": "sampling", "Puzzle Acc": "18.70", "Cell Acc": "40.57", "No answer": "3.20", "Easy Puzzle Acc": "57.50", "Hard Puzzle Acc": "3.61", "Total Puzzles": 1000, "Reason Lens": "1894.72" }, { "Model": "gemini-1.5-flash", "Mode": "sampling", "Puzzle Acc": "18.40", "Cell Acc": "36.03", "No answer": "12.80", "Easy Puzzle Acc": "57.86", "Hard Puzzle Acc": "3.06", "Total Puzzles": 1000, "Reason Lens": "1713.03" }, { "Model": "Meta-Llama-3-70B-Instruct", "Mode": "greedy", "Puzzle Acc": "16.80", "Cell Acc": "42.31", "No answer": "0.20", "Easy Puzzle Acc": "52.86", "Hard Puzzle Acc": "2.78", "Total Puzzles": 1000, "Reason Lens": "809.95" }, { "Model": "gemma-2-27b-it@nvidia", "Mode": "greedy", "Puzzle Acc": "16.30", "Cell Acc": "41.18", "No answer": "1.10", "Easy Puzzle Acc": "50.71", "Hard Puzzle Acc": "2.92", "Total Puzzles": 1000, "Reason Lens": "1014.56" }, { "Model": "claude-3-haiku-20240307", "Mode": "greedy", "Puzzle Acc": "14.30", "Cell Acc": "37.87", "No answer": "0.10", "Easy Puzzle Acc": "47.86", "Hard Puzzle Acc": "1.25", "Total Puzzles": 1000, "Reason Lens": "1015.06" }, { "Model": "reka-core-20240501", "Mode": "greedy", "Puzzle Acc": "13.00", "Cell Acc": "33.88", "No answer": "4.00", "Easy Puzzle Acc": "43.21", "Hard Puzzle Acc": "1.25", "Total Puzzles": 1000, "Reason Lens": "1078.29" }, { "Model": "gemma-2-9b-it", "Mode": "greedy", "Puzzle Acc": "12.90", "Cell Acc": "37.07", "No answer": "0.50", "Easy Puzzle Acc": "42.14", "Hard Puzzle Acc": "1.53", "Total Puzzles": 1000, "Reason Lens": "859.14" }, { "Model": "gemma-2-9b-it@nvidia", "Mode": "greedy", "Puzzle Acc": "12.80", "Cell Acc": "36.79", "No answer": "0.00", "Easy Puzzle Acc": "41.79", "Hard Puzzle Acc": "1.53", "Total Puzzles": 1000, "Reason Lens": "849.84" }, { "Model": "Meta-Llama-3-8B-Instruct", "Mode": "greedy", "Puzzle Acc": "11.90", "Cell Acc": "23.70", "No answer": "29.20", "Easy Puzzle Acc": "40.71", "Hard Puzzle Acc": "0.69", "Total Puzzles": 1000, "Reason Lens": "1216.40" }, { "Model": "gpt-3.5-turbo-0125", "Mode": "greedy", "Puzzle Acc": "10.10", "Cell Acc": "33.06", "No answer": "0.10", "Easy Puzzle Acc": "33.57", "Hard Puzzle Acc": "0.97", "Total Puzzles": 1000, "Reason Lens": "820.66" }, { "Model": "reka-flash-20240226", "Mode": "greedy", "Puzzle Acc": "9.30", "Cell Acc": "25.67", "No answer": "18.70", "Easy Puzzle Acc": "30.71", "Hard Puzzle Acc": "0.97", "Total Puzzles": 1000, "Reason Lens": "1074.80" }, { "Model": "Qwen2-7B-Instruct", "Mode": "greedy", "Puzzle Acc": "8.40", "Cell Acc": "22.06", "No answer": "24.40", "Easy Puzzle Acc": "29.29", "Hard Puzzle Acc": "0.28", "Total Puzzles": 1000, "Reason Lens": "1473.23" } ]