"""Module containing performance results for the Arcee-Lite model.""" results_arcee_lite = { "name": "Arcee-Lite", "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B", "configurations": [ { "instanceType": "c6i.xlarge", "quantization": "bitsandbytes-nf4", "container": "TGI 2.2.0", "status": "KO", "tokensPerSecond": "-", "notes": "OOM, might work with a prequantized model", }, { "instanceType": "c6i.2xlarge", "quantization": "bitsandbytes-nf4", "container": "TGI 2.2.0", "status": "KO", "tokensPerSecond": "-", "notes": "OOM, might work with a prequantized model", }, { "instanceType": "c6i.4xlarge", "configurations": [ { "quantization": "none", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "10.7", }, { "quantization": "bitsandbytes (int8)", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "10.5", }, { "quantization": "bitsandbytes-nf4", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "10.6", }, ], }, { "instanceType": "c7i.4xlarge", "quantization": "none", "container": "TGI 2.2.0", "status": "waiting for quota", "tokensPerSecond": "-", }, { "instanceType": "g5.xlarge", "configurations": [ { "quantization": "none", "container": "TGI 2.2.0", "status": "OK", "tokensPerSecond": "110", }, { "quantization": "none", "container": "DJL 0.28 vLLM", "status": "OK", "tokensPerSecond": "105", "notes": '"OPTION_MAX_MODEL_LEN": "32768",', }, ], }, { "instanceType": "g6e.2xlarge", "configurations": [ { "container": "TGI 2.2.0", "quantization": "none", "status": "OK", "tokensPerSecond": "160", }, { "container": "SGLang 0.2.13", "quantization": "none", "status": "OK", "tokensPerSecond": "167", }, { "container": "vLLM 0.5.5", "quantization": "none", "status": "OK", "tokensPerSecond": "150", }, ], }, ], }