"""Module containing performance results for the Llama-3-Supernova-Lite model."""
results_llama_supernova_lite = {
    "name": "Llama-3.1-SuperNova-Lite",
    "modelType": "Llama 3.1 8B",
    "configurations": [
        {
            "instanceType": "c7i.4xlarge",
            "configurations": [
                {
                    "quantization": "Q6_K",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",
                    "notes": "AMX enabled, Flash Attention enabled",
                },
                {
                    "quantization": "Q5_K",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",
                    "notes": "AMX enabled, Flash Attention enabled",
                },
                {
                    "quantization": "Q4_K",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",
                    "notes": "AMX enabled, Flash Attention enabled",
                },
                {
                    "quantization": "IQ4_XS",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",
                    "notes": "AMX enabled, Flash Attention enabled",
                },
            ],
        },
        {
            "instanceType": "c7g.8xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/18/24",
            "status": "OK",
            "tokensPerSecond": "39.7",
            "notes": "requantized from Q4_K_S",
        },
        {
            "instanceType": "c7g.16xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/18/24",
            "status": "OK",
            "tokensPerSecond": "45.5",
            "notes": "",
        },
        {
            "instanceType": "c8g.4xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 11/05/24",
            "status": "OK",
            "tokensPerSecond": "34",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.4xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "40",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.8xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "63",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.16xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "70",
            "notes": "with Flash Attention",
        },
    ],
}
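

# A minimal consumption sketch (an assumption, not part of the original
# module): `flatten_configurations` is a hypothetical helper that walks the
# mixed structure above, where most entries are flat but the c7i.4xlarge
# entry nests its own "configurations" list, one per quantization.
def flatten_configurations(results):
    """Yield one flat dict per (instance type, quantization) result."""
    for entry in results["configurations"]:
        nested = entry.get("configurations")
        if nested:
            # Grouped entry: copy the shared instance type onto each row.
            for sub in nested:
                yield {**sub, "instanceType": entry["instanceType"]}
        else:
            yield entry


# Example usage: print a quick per-instance throughput summary.
if __name__ == "__main__":
    for cfg in flatten_configurations(results_llama_supernova_lite):
        print(f'{cfg["instanceType"]:>13}  {cfg["quantization"]:>9}  '
              f'{cfg["tokensPerSecond"]:>5} tok/s  {cfg["notes"]}')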