Spaces:

arcee-ai
/

Benchmarks

Running

Benchmarks / results_arcee_nova.py

Julien Simon

Update

b63ff12 3 months ago

5.61 kB

	"""Module containing performance results for the Arcee-Nova model."""

	# Benchmark results for Arcee-Nova, keyed by instance type.
	#
	# Layout of the structure below:
	#   * top level: "name", "modelType", "notes" and a "configurations" list
	#     holding one entry per "instanceType";
	#   * an entry is either *flat* (one result stored directly on the entry) or
	#     *grouped* (several results stored in the entry's own nested
	#     "configurations" list);
	#   * a result carries "quantization", "container", "status",
	#     "tokensPerSecond" and, optionally, "notes".
	#
	# Every value is a string. "tokensPerSecond" is "-" on the results whose
	# status is "KO" or "not supported". Not every key is present on every
	# result (see the p4de.24xlarge and inf2.* entries), so readers of this
	# structure should not assume a fixed key set.
	results_arcee_nova = {
	    "name": "Arcee-Nova",
	    "modelType": "Qwen2 72B",
	    "notes": "",
	    "configurations": [
	        # --- g4dn.12xlarge: flat entry, single failed result ---
	        {
	            "instanceType": "g4dn.12xlarge",
	            "quantization": "bitsandbytes-nf4",
	            "container": "TGI 2.2.0",
	            "status": "KO",
	            "tokensPerSecond": "-",
	            "notes": "Flash Attention requires Ampere GPUs or newer",
	        },
	        # --- g5.12xlarge: grouped entry, four quantization variants ---
	        {
	            "instanceType": "g5.12xlarge",
	            "configurations": [
	                {
	                    "quantization": "bitsandbytes-nf4",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    "tokensPerSecond": "12",
	                },
	                {
	                    "quantization": "bitsandbytes-fp4",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    "tokensPerSecond": "12",
	                },
	                {
	                    "quantization": "bitsandbytes (int8)",
	                    "container": "TGI 2.2.0",
	                    "status": "KO",
	                    "tokensPerSecond": "-",
	                    "notes": "CUDA OOM",
	                },
	                {
	                    "quantization": "eetq (int8)",
	                    "container": "TGI 2.2.0",
	                    "status": "KO",
	                    "tokensPerSecond": "-",
	                    # NOTE(review): "Heurisitc" is kept exactly as recorded;
	                    # presumably a verbatim copy of the error output, so
	                    # do not correct the spelling without confirming.
	                    "notes": "[FT Error] Heurisitc failed to find a valid config.",
	                },
	            ],
	        },
	        # --- g5.48xlarge: grouped entry, four variants ---
	        {
	            "instanceType": "g5.48xlarge",
	            "configurations": [
	                {
	                    "quantization": "none",
	                    "container": "TGI 2.2.0",
	                    "status": "KO",
	                    "tokensPerSecond": "-",
	                    "notes": "CUDA OOM (but g6.48xlarge works!)",
	                },
	                {
	                    "quantization": "bitsandbytes-nf4",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    "tokensPerSecond": "12.3",
	                },
	                {
	                    "quantization": "bitsandbytes-fp4",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    "tokensPerSecond": "12.5",
	                },
	                {
	                    "quantization": "bitsandbytes (int8)",
	                    "container": "TGI 2.2.0",
	                    "status": "KO",
	                    "tokensPerSecond": "-",
	                    "notes": "The model deploys, but inference times out.",
	                },
	            ],
	        },
	        # --- g6.12xlarge: grouped entry, three variants ---
	        {
	            "instanceType": "g6.12xlarge",
	            "configurations": [
	                {
	                    "quantization": "bitsandbytes-nf4",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    # A range rather than a single figure.
	                    "tokensPerSecond": "1.5-2",
	                    "notes": "Too slow, timeouts are likely",
	                },
	                {
	                    "quantization": "bitsandbytes-fp4",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    "tokensPerSecond": "2",
	                    "notes": "Too slow, timeouts are likely",
	                },
	                {
	                    "quantization": "bitsandbytes (int8)",
	                    "container": "TGI 2.2.0",
	                    "status": "KO",
	                    "tokensPerSecond": "-",
	                    "notes": "CUDA OOM",
	                },
	            ],
	        },
	        # --- g6.48xlarge: flat entry, unquantized ---
	        {
	            "instanceType": "g6.48xlarge",
	            "quantization": "none",
	            "container": "TGI 2.2.0",
	            "status": "OK",
	            "tokensPerSecond": "12",
	        },
	        # --- g6e.12xlarge: grouped entry, one result per container ---
	        {
	            "instanceType": "g6e.12xlarge",
	            "configurations": [
	                {
	                    "quantization": "none",
	                    "container": "TGI 2.2.0",
	                    "status": "OK",
	                    "tokensPerSecond": "17",
	                },
	                {
	                    "quantization": "none",
	                    "container": "vLLM 0.5.5",
	                    "status": "OK",
	                    "tokensPerSecond": "17.8",
	                },
	                {
	                    "quantization": "none",
	                    "container": "SGLang 0.2.13",
	                    "status": "OK",
	                    "tokensPerSecond": "18.2",
	                },
	            ],
	        },
	        # --- p4d.24xlarge: flat entry; notes record the length settings ---
	        {
	            "instanceType": "p4d.24xlarge",
	            "quantization": "none",
	            "container": "TGI 2.2.0",
	            "status": "OK",
	            "tokensPerSecond": "40",
	            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
	        },
	        # --- p4de.24xlarge: flat entry with no "tokensPerSecond" key ---
	        {
	            "instanceType": "p4de.24xlarge",
	            "quantization": "none",
	            "container": "TGI 2.2.0",
	            "status": "waiting for quota",
	        },
	        # --- p5.48xlarge: flat entry; notes record the length settings ---
	        {
	            "instanceType": "p5.48xlarge",
	            "quantization": "none",
	            "container": "TGI 2.2.0",
	            "status": "OK",
	            "tokensPerSecond": "58",
	            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
	        },
	        # --- inf2.*: flat entry with no "quantization" key ---
	        {
	            "instanceType": "inf2.*",
	            "container": "TGI 2.2.0",
	            "status": "not supported",
	            "tokensPerSecond": "-",
	            "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
	        },
	    ],
	}