"""Performance results recorded for the Arcee-SuperNova model.

The module exposes a single public name, ``results_arcee_supernova``.
"""

# Container labels that appear in more than one record.
_LLAMA_CPP = "llama.cpp 9/19/24"
_TGI = "TGI 2.2.0"
_LMI_NEURONX = "LMI 0.29+transformers-neuronx 0.11.351"
_LMI_VLLM = "LMI 0.29+vLLM 0.5.5"

# Note text shared by several TGI records.
_LIMITS_16K_32K = "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768"


def _measurement(quantization, container, status, tokens_per_second, notes=""):
    """Return one measurement record.

    Keys are inserted in the order quantization, container, status,
    tokensPerSecond, notes. Every value is stored exactly as passed in
    (all callers in this module pass strings).
    """
    return {
        "quantization": quantization,
        "container": container,
        "status": status,
        "tokensPerSecond": tokens_per_second,
        "notes": notes,
    }


def _single(instance_type, *fields, **named_fields):
    """Return a flat record: ``instanceType`` followed by the measurement keys."""
    record = {"instanceType": instance_type}
    record.update(_measurement(*fields, **named_fields))
    return record


def _group(instance_type, *measurements):
    """Return a nested record: ``instanceType`` plus a list of measurements."""
    return {"instanceType": instance_type, "configurations": list(measurements)}


# NOTE(review): the outer "configurations" list mixes two record shapes --
# flat records (instance and measurement keys side by side) and nested records
# (an instance with its own inner "configurations" list). Readers of this data
# have to handle both.
# NOTE(review): "tokensPerSecond" is always a string; besides numerals it also
# holds the placeholders "-" and "???", and one record has status "???".
# Presumably that run is still pending -- confirm before relying on it.
results_arcee_supernova = {
    "name": "Arcee-SuperNova",
    "modelType": "Llama 3.1 70B",
    "configurations": [
        # Flat records.
        _single("c7g.16xlarge", "Q4_0_8_8", _LLAMA_CPP, "OK", "6.5"),
        _single(
            "r8g.16xlarge", "Q4_0_4_8", _LLAMA_CPP, "OK", "9.8",
            "With Flash Attention",
        ),
        _single(
            "g5.12xlarge", "awq", _TGI, "OK", "33",
            "MAX_INPUT_TOKENS: 8192, MAX_TOTAL_TOKENS: 16384",
        ),
        _single("p4d.24xlarge", "awq", _TGI, "OK", "58", _LIMITS_16K_32K),
        _single("p5.48xlarge", "awq", _TGI, "OK", "73", _LIMITS_16K_32K),
        # Nested records.
        _group(
            "inf2.24xlarge",
            _measurement(
                "none", _LMI_NEURONX, "KO", "-",
                "OOM bs=2,seqlen=4096 - SDK 2.19.1",
            ),
            _measurement(
                "none", _LMI_NEURONX, "KO", "-",
                "OOM bs=2,seqlen=2048 - SDK 2.19.1",
            ),
            _measurement(
                "8-bit", _LMI_NEURONX, "???", "???",
                "bs=2,seqlen=8192 - SDK 2.19.1 - OPTION_LOAD_IN_8BIT=True",
            ),
        ),
        _group(
            "inf2.48xlarge",
            _measurement(
                "none", _LMI_NEURONX, "OK", "28",
                "bs=4,seqlen=4096 - SDK 2.19.1",
            ),
            _measurement(
                "none", _LMI_NEURONX, "OK", "24",
                "bs=2,seqlen=8192 - SDK 2.19.1",
            ),
            _measurement(
                "none", _LMI_NEURONX, "KO", "-",
                "OOM bs=2,seqlen=16384 - SDK 2.19.1",
            ),
        ),
        _group(
            "trn1.32xlarge",
            _measurement(
                "none", _LMI_NEURONX, "OK", "32",
                "bs=2,seqlen=8192 - SDK 2.19.1",
            ),
        ),
        _group(
            "p4d.24xlarge",
            _measurement("none", _TGI, "OK", "30"),
            _measurement("none", _LMI_VLLM, "OK", "45"),
        ),
        _group(
            "p5.48xlarge",
            _measurement("none", _TGI, "OK", "58", _LIMITS_16K_32K),
            _measurement("none", _LMI_VLLM, "OK", "70"),
        ),
    ],
}