diff --git "a/static/eval_results/SI/all_model_keywords_stats.json" "b/static/eval_results/SI/all_model_keywords_stats.json" new file mode 100644--- /dev/null +++ "b/static/eval_results/SI/all_model_keywords_stats.json" @@ -0,0 +1,4952 @@ +{ + "Aquila_VL_2B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.23446107609710548 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.08500232938689507 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2736043135287443 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19099680045595863 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.39206349206349206 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3004030430829456 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.08421801129956362 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2897054521388083 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.10279080594456047 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3078950207372175 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.2248398559924241 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3533180891172854 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.11430966292465267 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.11601893140078427 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2219754327969366 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.1772030496280578 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.1884228017877996 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.43875114784205704 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23519563962981577 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.28092356180071465 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.13944236147744013 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.3826225373137124 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.20221672149607509 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6020225563909773 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.2521179990443663 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.19504930283108274 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2374462987863378 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.0625675073438388 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.3521969849344277 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.18502360430789122 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.17480107496737848 + } + } + }, + "Aria": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.38003253384687213 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.33746818901184633 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4097428531166082 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.22745674367681176 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4142857142857143 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.4433718463877228 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.10860172719687727 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3496496998103286 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.04960831797041802 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.40912566596786665 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.3300885226603808 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.45572004760273754 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.259572791833904 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.27807228404309764 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3440023372395526 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3053148323646246 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2579833154471113 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4787572696663607 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3082165471908181 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.45805038774421686 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.3227895527307711 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5240018518464876 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.3401734439719901 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7129097744360902 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.40684369400912745 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.300830802045758 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.33433893000455434 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07560632809892315 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.49083567506460973 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.22595636868728874 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.3653361644690575 + } + } + }, + "Claude_3.5": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.6124985410830999 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.6692574633083122 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5401030980230185 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4760293511799448 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4174603174603175 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6061759059165749 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.315623741632974 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.5134329832846579 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.34512576094802216 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.6014068374421209 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.5589506892621444 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5314705050989759 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4753194125515341 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.54981020669637 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5373019912310933 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.5072889926389097 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.5112348724553849 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6164633346451529 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.4712835541311676 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5769294912151234 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5556080592390198 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.6017116084931068 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.530309401925396 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7033233082706767 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5757222503903228 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.5044379729567133 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.5499261524919171 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.19196633042767672 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.636886763741019 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.4511182385296208 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.6192941518442948 + } + } + }, + "Claude_3.5_new": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.565344887955182 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.6633000290867174 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5737128945237007 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4831956110227109 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.6285714285714286 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6465631513465354 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.3511145464456188 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.5580232103280633 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3619606028475468 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5927094432064197 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.5899091882733952 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5838312144672865 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4705509892153899 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.574168555556774 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5636254729390459 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.5249488326690246 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.5300876558354416 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6380252743889108 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.5106873710119535 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.6409616762702612 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5638133905687104 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.6433122980573076 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.5426169039575065 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6839924812030076 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.6234123112506059 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.5171075478248572 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.583387314927874 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.22440221381985706 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6507240054983652 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.48795977188332873 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.6242355223474262 + } + } + }, + "GPT_4o": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.5785991479925302 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.7387886231372116 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.6073751328612617 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4387500704123191 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.626984126984127 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6418126331560571 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.302146719713088 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.5184702350129554 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3427989299648589 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.6086090683867454 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.533172482404735 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.6107746700730057 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4938444672052553 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.6093502418300007 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5672657028463585 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.5351259728352326 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.6016521462358102 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6204512659058113 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.4632537848154335 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.6563556079088679 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.5370230887013343 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.6716375018861761 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.5506629280904943 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7342894736842105 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.6512174145248227 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.47164342831848766 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.5798789532163023 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.1970421212123289 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6933181759121947 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.4267383416112408 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.6400436962819274 + } + } + }, + "GPT_4o_mini": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.4556095354808589 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.5484747566251307 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.535145025177205 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.33759329198549914 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4873015873015873 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5428885848330401 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.22983305008250185 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.39601047285667923 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1194248916897328 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5198662454862603 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.4194828137611333 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5561844967140265 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.3779902828155749 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.4645916955325127 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.46322170353087255 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.381768248331173 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.42048902061937554 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5559184922821285 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3777213713726476 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5986898724975707 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4761935495255144 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5775026308600164 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.4555977624507237 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7960714285714285 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5451973412590135 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.37680368570252215 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.48241878593503174 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.17294565844175996 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5987052352447554 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.33277278942510824 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.5024587375994013 + } + } + }, + "Gemini_1.5_flash_002": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.47501823646125113 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.4328505884518674 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5102257466534984 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.33330909636235384 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5095238095238095 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5507427313044685 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.19508720733284174 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.4351415236240936 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3555116262572404 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5404112582997231 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.42366990116355135 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5726107634234434 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.33154206029123856 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.3656537691630919 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4427944359714585 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3680682749954099 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3994332512947306 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5646552101097555 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.377682596312313 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5536141293443697 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4434262068907506 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5949207694245245 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.4385603970138852 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7734661654135339 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5163987806731475 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.35922563291424964 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.46460120838976576 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.12518962860872068 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5961096083948861 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3571876703463106 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.4329456546880451 + } + } + }, + "Gemini_1.5_pro_002": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.48587549603174607 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.504539358390968 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.5660366627264668 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.4200866579901879 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.48888888888888893 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5964613809728712 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.28536490696494377 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.500158537824293 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.3592697030984118 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.6217290675275775 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.5132563067393096 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5888558035357285 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4060403716629095 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.42724302639929596 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5034399620483027 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.43754003302746525 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4731762319443037 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.6245091608727974 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.48334866543174226 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.5644701189535662 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4972242280053817 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5995804836966744 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.5090111123207751 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7830639097744362 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5647567827649111 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.448099634405986 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.5220033468415737 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.178032259819607 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6342882147970302 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3972807544005462 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.5000257619938475 + } + } + }, + "Idefics3": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.10420386904761905 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.03610947192711297 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.14759816564804443 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.07952603609985566 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.19999999999999998 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.17708549842279478 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.04525221984520586 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.1804888391778344 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.020659062938075456 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.21050154891192577 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.14766910173600153 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.171712228743858 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.06561871098996794 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.03857183991826921 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.12057604157917215 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.15091196450213815 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.053829016986911726 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.13726004635095543 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.10744987600153451 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.2975217887286715 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.02100010342704044 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.2126465842330819 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.1166739111764397 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.2774436090225564 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.1724799353848912 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.1275512898313342 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.12579260798514427 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.014803312629399587 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.15897902615904647 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.09276606649010487 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.07893017100109866 + } + } + }, + "InternVL2_2B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.09082268323996265 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.03678177133215256 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.20753533217719797 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.12084183290294437 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3428571428571428 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.19769593666817548 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.039950835968771276 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.15289272275533383 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.07184873949579831 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.18693717087010792 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.15159509081542988 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.22923075081716637 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.09447908809124074 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.049217594376760605 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.14262795568189013 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.12369372450210245 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.11544832152620972 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.3044601862783681 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.12291071957107838 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.24746476545671045 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.042960275283590164 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.3035836752792625 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.08201891993308255 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.4728533834586467 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.1905261989833371 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.1336101595652968 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.13333012698269087 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.013664596273291925 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.23055380602943532 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.11985812372011641 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.12376971454228163 + } + } + }, + "InternVL2_76B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.42624956232493 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.4585598678029664 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.47251288369387245 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3075073077960568 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5301587301587302 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.5361401478255164 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.1619462380866451 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.38874625564304305 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.30169355252977215 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.509332186292545 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.39253566766026804 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5065289649268628 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.3333759749379774 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.39401514252711556 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4205132675160581 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3863929410693585 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4041893680050902 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5389260571078752 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.34950523809271744 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.48322911874283003 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.4030580663588658 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5873606708191794 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.37110860027855824 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7041804511278196 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.470239452171767 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.3413715846680563 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.4230856844269695 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.10153556963007855 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.570666577587141 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.3276283897777921 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.4672429826553732 + } + } + }, + "InternVL2_8B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.34736300770308126 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.25646898023629483 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.34366199611891174 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.23531351908862871 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3253968253968254 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3784296942438538 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.09134825639389237 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2912783917684807 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.0503849634147267 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.34383350461121587 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.27187498646061353 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4088467630174509 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.21516421271234623 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.22539102164423624 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.29215647267040246 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.25281668404704594 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2452385560845516 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4334863789409244 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.26248166960198344 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3417106670258814 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.27991889529924496 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4403771552269444 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.27396131593770284 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6521729323308272 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.3284779417766259 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.24983605813271914 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2915702951202482 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.0592961015994038 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.41603267498315427 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.21701915158341967 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.30220279568886643 + } + } + }, + "Llama_3_2_11B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.14131944444444444 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.23423754995839735 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.33493936008655223 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.12719796356144183 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.22857142857142856 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.2740778723883188 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.09595984705908096 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.18716549835825297 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.14822411270107955 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3275861238187186 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.1970899659349296 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3387317156024255 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.13775107230512224 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18967604731477847 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.23165426777444673 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.15123880546660726 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.16571305203663964 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.3762691853600945 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.16301171403498463 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.34463240030392384 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.24509462859331077 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.39649168256429074 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.21893599730050764 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.5728796992481204 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.25994005315432245 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.14653430680774066 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2546845731733449 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.058403715092363084 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.37246318118748967 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.15806381880426276 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.20716804318138016 + } + } + }, + "MiniCPM_v2.6": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.20497125933706817 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.21340553041678637 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.33417132133610217 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.14556723677922526 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3507936507936508 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3620762837308124 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.07517089101065139 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.25260048981169975 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.012567281814686655 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.34994481629202306 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.23021362338817897 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3681846956052881 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.17128318830807052 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.21066692306852683 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25947537124244935 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.23679437883858215 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.21540007432647457 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.39586776859504136 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.2036075191422558 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3711731498662282 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.20284349423013687 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.45156722842924535 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.2244713686485571 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.660718045112782 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.3045977370408878 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.18352505380246076 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2657183000752527 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.06087615859328559 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.3977302205205499 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.17375496033997198 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.26814713591233313 + } + } + }, + "Molmo_72B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.2582151610644257 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.5042591723808818 + }, + "Knowledge": { + "count": 77, + "num_samples": 1279, + "tasks": [], + "average_score": 0.39648868632862583 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.2954490282663994 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.6714285714285714 + }, + "Perception": { + "count": 82, + "num_samples": 1306, + "tasks": [], + "average_score": 0.4704848349431393 + }, + "Planning": { + "count": 44, + "num_samples": 698, + "tasks": [], + "average_score": 0.13015529062282669 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3557374102316002 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.18757766329699532 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.4405271103381682 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1509, + "tasks": [], + "average_score": 0.35176591065677537 + }, + "Photographs": { + "count": 83, + "num_samples": 1300, + "tasks": [], + "average_score": 0.47052754190598134 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.24743187516175363 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1107, + "tasks": [], + "average_score": 0.3754692399771127 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5182, + "tasks": [], + "average_score": 0.3757024328002091 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3048441329189725 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.36443166533642163 + }, + "multiple_choice": { + "count": 33, + "num_samples": 552, + "tasks": [], + "average_score": 0.5421225239407056 + }, + "numerical_data": { + "count": 39, + "num_samples": 679, + "tasks": [], + "average_score": 0.3342330361070466 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.4120820025247545 + }, + "structured_output": { + "count": 72, + "num_samples": 1105, + "tasks": [], + "average_score": 0.3670439889863054 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 639, + "tasks": [], + "average_score": 0.445412976139552 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.3070615049173117 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 155, + "tasks": [], + "average_score": 0.5953120300751881 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1698, + "tasks": [], + "average_score": 0.4110431137367615 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1615, + "tasks": [], + "average_score": 0.2983397150768741 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.4223762317042425 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 340, + "tasks": [], + "average_score": 0.07825953913967484 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5756984198310193 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1257, + "tasks": [], + "average_score": 0.29197652844726363 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1672, + "tasks": [], + "average_score": 0.41462128751753047 + } + } + }, + "Molmo_7B_D": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.1158110119047619 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.27184002856754413 + }, + "Knowledge": { + "count": 77, + "num_samples": 1279, + "tasks": [], + "average_score": 0.2787344822161389 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.1740048655548875 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3619047619047619 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.30311570603428784 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.06424366688759706 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.173722800705029 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09043432702433757 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3106093738160722 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.21356852314768052 + }, + "Photographs": { + "count": 83, + "num_samples": 1300, + "tasks": [], + "average_score": 0.323282724310645 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1327652104313917 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18796442406686825 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5213, + "tasks": [], + "average_score": 0.22943156697817663 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.17305260714177756 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.17907829453546903 + }, + "multiple_choice": { + "count": 33, + "num_samples": 552, + "tasks": [], + "average_score": 0.3169618260527351 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.22086240998395923 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.324079404512755 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.21610753722787088 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 639, + "tasks": [], + "average_score": 0.32356781790614975 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.19244928377978027 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 155, + "tasks": [], + "average_score": 0.4433947368421053 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1698, + "tasks": [], + "average_score": 0.25685172601108597 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.17259103199957743 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.24958564675030656 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.035588894400059294 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.35830528296805764 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.1939605648275455 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2239160707791646 + } + } + }, + "NVLM": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.37153871965452856 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.352859881186271 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.37572531212341936 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.2786818799518423 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3047619047619048 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.45079588183469294 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.1252138046141793 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3518857602487131 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09447890526012262 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.4387718807206103 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.32094439294995036 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4332099707344069 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.30070480033875985 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.2814148428882822 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.345503562629823 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3215154320779893 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.29287492253780084 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5016004197822379 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.28793758479482745 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3828322321439372 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.34135355449546323 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.45915496990325566 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.3152573721587561 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6521954887218044 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.38986101015677044 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.30043411704099793 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.3359094293956291 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07615011020932495 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.46386896656934745 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.26907670581189963 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.3943476764428869 + } + } + }, + "POINTS_7B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3151282387955181 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.22503259387671015 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.27361452525243724 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19633555542091463 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.34761904761904755 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3737263982731003 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.08476480516686397 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2606187882141402 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1499797713556708 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.33916980654110634 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.25684059763242745 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.3523684400745285 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.19332242733156837 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18689735511962233 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2615189201461682 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.23004840221723208 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.239982641771955 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.4200183654729108 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23646374895042882 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.28263350209672056 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.2320749867881998 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.36827291874151846 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.21311917080615544 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.48204135338345855 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.2799740367463896 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.22387504020162652 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.27890902837062037 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.06502747891786666 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.37373928195086786 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.21857370538972226 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2684488868499041 + } + } + }, + "Phi-3.5-vision": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3150531045751634 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.18412184931451608 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3374902354661273 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19473774010136682 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4142857142857143 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.38360617573843164 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.09254779551496593 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.3034971430938622 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.04423070234557053 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3249099963089235 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.2797292831010349 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4073649042468842 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1852656532829957 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.18482544209393917 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2789407286767065 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.2141318618135909 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.23002523914604356 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.46076785167694245 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.20335546763980886 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.38510487366381607 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.2567782320477167 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.44526176399160444 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.26422404318271525 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6958045112781954 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.3097558922032538 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.22905610983444738 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.2845968124529633 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08173397535709728 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.4299430434813172 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.21524515429041854 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2682909697086125 + } + } + }, + "Pixtral_12B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3689221521942111 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.4143415072482432 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.37374171749764634 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.27839183583970506 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3444444444444444 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.46377210154054166 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.122839038193565 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.35876745089800455 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.11048396896880823 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.3947713702430871 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.36461586731895695 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4327891810625066 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.2688429906381188 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.31669784888602887 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3567653041737331 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3161209026018942 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.29510067482559116 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5076172985263894 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.3135393276021012 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3995518703501119 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.36511340930610364 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4193828210432134 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.35085932465399283 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6302142857142857 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.38842270268832113 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.3055711926752603 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.37359181974124417 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08507904212012304 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.4677006268371793 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.28269833721806076 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.39551360119171197 + } + } + }, + "Qwen2_VL_2B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.18075323879551822 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.21948696002702636 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.28841305815072016 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.16147424237969243 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.36984126984126986 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.33781829803679747 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.08656714113327156 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.2448949597527861 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.09293971931071163 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.2842921728720087 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.23259922343062173 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.36205043973893236 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.13312812081322709 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.1930642044577058 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.24428672223428244 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.1652854805017628 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.17061075451792151 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.42328479601206864 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.23904036592289388 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3296071840681468 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.23210528191388644 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.37769658880841467 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.21906286524745977 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.5887067669172933 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.27091980735233906 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.19211647307230917 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.25965511679594977 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07432337143230854 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.3778480095314066 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.19305913502727232 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2253353309586889 + } + } + }, + "Qwen2_VL_72B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.48352372198879545 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.6323628750211533 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4874613649312476 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.3355316008767396 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.39365079365079364 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.6141225191470527 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.23323065689783842 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.41914085094672937 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.18309869697155778 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.5251544991587351 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.4473618716373871 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.5308367876160253 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.4333175250859433 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.5070634902661117 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4834475464413966 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.45605294241715827 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.4608929319719144 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5851458306003763 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.44066773476234555 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.4974532098882374 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.49191356756271953 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.5782670824874114 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.43580017776139807 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.7294097744360902 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.5387802130987105 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.40095954140813556 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.49559260360544427 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.1474368760019346 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.6040487985710314 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.367367170491919 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.5688395686544739 + } + } + }, + "Qwen2_VL_7B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.36572347689075624 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.44618631789079277 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.40527029084195965 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.25874500882297563 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.3507936507936508 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.47845712831657317 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.13224920829749706 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.28910547521894076 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.167887099917599 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.39575781162159634 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.3279988413468837 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.4722059967533397 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.27651089530142536 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.3555822260000372 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3669159632302898 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.3343930759222326 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3068323820854221 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5064978792251521 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.31569247186288174 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.39180263622429157 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.38908261098680974 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4927960336040459 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.31735419703044254 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.6569285714285714 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.41129100495999377 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.28799562910106935 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.3844930054666535 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08484497782236566 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.5151962864568788 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2780300019986884 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.40533138482347386 + } + } + }, + "llava_onevision_72B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.3101241538281979 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.21993316800752236 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4073185744352188 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.30843360355217414 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.4857142857142857 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.4151635490932759 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.14332941205758537 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.34229099411259356 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.15000864315905132 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.48700494939767686 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.3420108320438131 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.46321361231985364 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1991087184305048 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.20630840715151963 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.32994677641726666 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.2595306800419483 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.3154587757748795 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.5216100397918579 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.29549573982348826 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3969569321996683 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.28638031668330033 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.49641793863653866 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.34020787956522225 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.677251879699248 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.367151258145213 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.2882162928135965 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.35493339032346644 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.08886502118921868 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.49931032043437723 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.28423002295958694 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.2705047345723313 + } + } + }, + "llava_onevision_7B": { + "app": { + "Coding": { + "count": 16, + "num_samples": 244, + "tasks": [], + "average_score": 0.20031585550887018 + }, + "Information_Extraction": { + "count": 41, + "num_samples": 644, + "tasks": [], + "average_score": 0.1340041159644947 + }, + "Knowledge": { + "count": 77, + "num_samples": 1294, + "tasks": [], + "average_score": 0.32565632074201306 + }, + "Mathematics": { + "count": 30, + "num_samples": 497, + "tasks": [], + "average_score": 0.19520567001898761 + }, + "Metrics": { + "count": 3, + "num_samples": 45, + "tasks": [], + "average_score": 0.5126984126984127 + }, + "Perception": { + "count": 82, + "num_samples": 1321, + "tasks": [], + "average_score": 0.3545352938542377 + }, + "Planning": { + "count": 44, + "num_samples": 714, + "tasks": [], + "average_score": 0.10542024755948716 + }, + "Science": { + "count": 22, + "num_samples": 469, + "tasks": [], + "average_score": 0.27440171167785654 + } + }, + "input_format": { + "3D Models and Aerial Imagery": { + "count": 2, + "num_samples": 30, + "tasks": [], + "average_score": 0.1783310257200802 + }, + "Artistic and Creative Content": { + "count": 22, + "num_samples": 389, + "tasks": [], + "average_score": 0.39584024260311845 + }, + "Diagrams and Data Visualizations": { + "count": 88, + "num_samples": 1524, + "tasks": [], + "average_score": 0.252511232938778 + }, + "Photographs": { + "count": 83, + "num_samples": 1315, + "tasks": [], + "average_score": 0.41346984169922946 + }, + "Text-Based Images and Documents": { + "count": 53, + "num_samples": 847, + "tasks": [], + "average_score": 0.1159417852705533 + }, + "User Interface Screenshots": { + "count": 67, + "num_samples": 1123, + "tasks": [], + "average_score": 0.1368238769607056 + } + }, + "input_num": { + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25687697499702805 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 63, + "num_samples": 975, + "tasks": [], + "average_score": 0.19203135933620985 + }, + "exact_text": { + "count": 57, + "num_samples": 880, + "tasks": [], + "average_score": 0.2490174433570946 + }, + "multiple_choice": { + "count": 33, + "num_samples": 567, + "tasks": [], + "average_score": 0.43553281735099914 + }, + "numerical_data": { + "count": 39, + "num_samples": 694, + "tasks": [], + "average_score": 0.22047389017098817 + }, + "open_ended_output": { + "count": 51, + "num_samples": 991, + "tasks": [], + "average_score": 0.3490743804978922 + }, + "structured_output": { + "count": 72, + "num_samples": 1121, + "tasks": [], + "average_score": 0.19236693222061413 + } + }, + "skills": { + "Commonsense and Social Reasoning": { + "count": 38, + "num_samples": 654, + "tasks": [], + "average_score": 0.4322205869643684 + }, + "Domain-Specific Knowledge and Skills": { + "count": 46, + "num_samples": 897, + "tasks": [], + "average_score": 0.24367762339842414 + }, + "Ethical and Safety Reasoning": { + "count": 10, + "num_samples": 170, + "tasks": [], + "average_score": 0.5779849624060149 + }, + "Language Understanding and Generation": { + "count": 102, + "num_samples": 1713, + "tasks": [], + "average_score": 0.28693734738201987 + }, + "Mathematical and Logical Reasoning": { + "count": 91, + "num_samples": 1630, + "tasks": [], + "average_score": 0.19593817255686638 + }, + "Object Recognition and Classification": { + "count": 172, + "num_samples": 2714, + "tasks": [], + "average_score": 0.292593666904816 + }, + "Planning and Decision Making": { + "count": 23, + "num_samples": 356, + "tasks": [], + "average_score": 0.07666140459493773 + }, + "Scene and Event Understanding": { + "count": 60, + "num_samples": 1004, + "tasks": [], + "average_score": 0.44333006096492455 + }, + "Spatial and Temporal Reasoning": { + "count": 78, + "num_samples": 1273, + "tasks": [], + "average_score": 0.2134151671467958 + }, + "Text Recognition (OCR)": { + "count": 101, + "num_samples": 1687, + "tasks": [], + "average_score": 0.19363816536239586 + } + } + } +} \ No newline at end of file