Spaces:
Running
Running
File size: 6,409 Bytes
0c0f086 b194192 4cec6db 2ac5435 4cec6db 0c0f086 ce1f985 0c0f086 6b33c1f 22854a2 58285ac 3a66d4b 58285ac bf572e3 430197f bf572e3 430197f bf572e3 430197f bf572e3 e850022 430197f bf572e3 ea42ef4 534766c ea42ef4 bf572e3 fe190a7 10d93f3 430197f 10d93f3 430197f 10d93f3 430197f 10d93f3 430197f 10d93f3 430197f 10d93f3 fe190a7 ea42ef4 d17a4c1 ea42ef4 0c0f086 8639c9c d17a4c1 8639c9c 10d93f3 8639c9c 2b18bbe 8639c9c d17a4c1 8639c9c 0c0f086 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
"""Module containing performance results for the Arcee-SuperNova model."""
# Benchmark results for Arcee-SuperNova.
#
# Layout of the top-level "configurations" list -- entries come in two shapes:
#   * flat    - a single run: "instanceType" sits next to "quantization",
#               "container", "status", "tokensPerSecond" and "notes".
#   * grouped - several runs on one instance type: "instanceType" plus a
#               nested "configurations" list whose items carry the remaining
#               five keys.
#
# Every value is a string, including "tokensPerSecond". Runs whose status is
# "KO" use "-" as their throughput. p4d.24xlarge and p5.48xlarge each appear
# twice: once as a flat "awq" entry and once as a grouped entry holding the
# runs with quantization "none".
results_arcee_supernova = {
    "name": "Arcee-SuperNova",
    "modelType": "Llama 3.1 70B",
    "configurations": [
        # ---- Flat entries: one run per entry ----
        {
            "instanceType": "c7g.16xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/19/24",
            "status": "OK",
            "tokensPerSecond": "6.5",
            "notes": "",
        },
        {
            "instanceType": "r8g.16xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/19/24",
            "status": "OK",
            "tokensPerSecond": "9.8",
            "notes": "With Flash Attention",
        },
        {
            "instanceType": "g5.12xlarge",
            "quantization": "awq",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "33",
            "notes": "MAX_INPUT_TOKENS: 8192, MAX_TOTAL_TOKENS: 16384",
        },
        {
            "instanceType": "g6e.2xlarge",
            "quantization": "awq (w4 g128)",
            "container": "vLLM 0.6.2",
            "status": "OK",
            "tokensPerSecond": "18",
            "notes": "--max-model-len 10000 --max-num-seqs 16 --enforce-eager",
        },
        {
            "instanceType": "g6e.2xlarge",
            "quantization": "Q4_K_M",
            "container": "llama.cpp 10/2/24",
            "status": "OK",
            "tokensPerSecond": "16",
            "notes": "-ngl 81 -c 13000 -fa -t 8",
        },
        {
            "instanceType": "p4d.24xlarge",
            "quantization": "awq",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "58",
            "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
        },
        {
            "instanceType": "p5.48xlarge",
            "quantization": "awq",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "73",
            "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
        },
        # ---- Grouped entries: several runs per instance type ----
        {
            "instanceType": "inf2.24xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM bs=2,seqlen=4096 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM bs=2,seqlen=2048 - SDK 2.19.1",
                },
                # NOTE(review): "???" looks like a placeholder for a run whose
                # outcome has not been recorded yet -- confirm and fill in.
                {
                    "quantization": "8-bit",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "???",
                    "tokensPerSecond": "???",
                    "notes": "bs=2,seqlen=8192 - SDK 2.19.1 - OPTION_LOAD_IN_8BIT=True",
                },
            ],
        },
        {
            "instanceType": "inf2.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "OK",
                    "tokensPerSecond": "28",
                    "notes": "bs=4,seqlen=4096 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "OK",
                    "tokensPerSecond": "24",
                    "notes": "bs=2,seqlen=8192 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM bs=2,seqlen=16384 - SDK 2.19.1",
                },
            ],
        },
        {
            "instanceType": "trn1.32xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "OK",
                    "tokensPerSecond": "32",
                    "notes": "bs=2,seqlen=8192 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.30rc1",
                    "status": "OK",
                    "tokensPerSecond": "34",
                    "notes": "bs=2,seqlen=8192 - SDK 2.20",
                },
            ],
        },
        {
            "instanceType": "p4d.24xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "30",
                    "notes": "",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "45",
                    "notes": "OPTION_MAX_MODEL_LEN 64k",
                },
            ],
        },
        {
            "instanceType": "p5.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "58",
                    "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "70",
                    "notes": "OPTION_MAX_MODEL_LEN 128k",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "70",
                    "notes": "OPTION_ENFORCE_EAGER=True",
                },
            ],
        },
    ],
}
|