"""Module containing performance results for the Arcee-SuperNova model.""" results_arcee_supernova = { "name": "Arcee-SuperNova", "modelType": "Llama 3.1 70B", "configurations": [ { "instanceType": "c7g.16xlarge", "quantization": "Q4_0_8_8", "container": "llama.cpp 9/19/24", "status": "OK", "tokensPerSecond": "6.5", "notes": "", }, { "instanceType": "r8g.16xlarge", "quantization": "Q4_0_4_8", "container": "llama.cpp 9/19/24", "status": "OK", "tokensPerSecond": "9.8", "notes": "With Flash Attention", }, { "instanceType": "g5.12xlarge", "quantization": "awq", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "33", "notes": "MAX_INPUT_TOKENS: 8192, MAX_TOTAL_TOKENS: 16384", }, { "instanceType": "g6e.2xlarge", "quantization": "awq (w4 g128)", "container": "vLLM 0.6.2", "status": "OK", "tokensPerSecond": "18", "notes": "--max-model-len 10000 --max-num-seqs 16 --enforce-eager", }, { "instanceType": "g6e.2xlarge", "quantization": "Q4_K_M", "container": "llama.cpp 10/2/24", "status": "OK", "tokensPerSecond": "16", "notes": "-ngl 81 -c 13000 -fa -t 8", }, { "instanceType": "p4d.24xlarge", "quantization": "awq", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "58", "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768", }, { "instanceType": "p5.48xlarge", "quantization": "awq", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "73", "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768", }, { "instanceType": "inf2.24xlarge", "configurations": [ { "quantization": "none", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "KO", "tokensPerSecond": "-", "notes": "OOM bs=2,seqlen=4096 - SDK 2.19.1", }, { "quantization": "none", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "KO", "tokensPerSecond": "-", "notes": "OOM bs=2,seqlen=2048 - SDK 2.19.1", }, { "quantization": "8-bit", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "???", "tokensPerSecond": "???", "notes": "bs=2,seqlen=8192 - SDK 2.19.1 - OPTION_LOAD_IN_8BIT=True", }, ], }, { "instanceType": "inf2.48xlarge", "configurations": [ { "quantization": "none", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "OK", "tokensPerSecond": "28", "notes": "bs=4,seqlen=4096 - SDK 2.19.1", }, { "quantization": "none", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "OK", "tokensPerSecond": "24", "notes": "bs=2,seqlen=8192 - SDK 2.19.1", }, { "quantization": "none", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "KO", "tokensPerSecond": "-", "notes": "OOM bs=2,seqlen=16384 - SDK 2.19.1", }, ], }, { "instanceType": "trn1.32xlarge", "configurations": [ { "quantization": "none", "container": "LMI 0.29+transformers-neuronx 0.11.351", "status": "OK", "tokensPerSecond": "32", "notes": "bs=2,seqlen=8192 - SDK 2.19.1", }, { "quantization": "none", "container": "LMI 0.30rc1", "status": "OK", "tokensPerSecond": "34", "notes": "bs=2,seqlen=8192 - SDK 2.20", }, ], }, { "instanceType": "p4d.24xlarge", "configurations": [ { "quantization": "none", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "30", "notes": "", }, { "quantization": "none", "container": "LMI 0.29+vLLM 0.5.5", "status": "OK", "tokensPerSecond": "45", "notes": "OPTION_MAX_MODEL_LEN 64k", }, ], }, { "instanceType": "p5.48xlarge", "configurations": [ { "quantization": "none", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "58", "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768", }, { "quantization": "none", "container": "LMI 0.29+vLLM 
0.5.5", "status": "OK", "tokensPerSecond": "70", "notes": "OPTION_MAX_MODEL_LEN 128k", }, { "quantization": "none", "container": "LMI 0.29+vLLM 0.5.5", "status": "OK", "tokensPerSecond": "70", "notes": "OPTION_ENFORCE_EAGER=True", }, ], }, ], }