Julien Simon committed on
Commit
430197f
1 Parent(s): 3a66d4b
results_arcee_nova.py CHANGED
@@ -153,12 +153,5 @@ results_arcee_nova = {
153
  "tokensPerSecond": "58",
154
  "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
155
  },
156
- {
157
- "instanceType": "inf2.*",
158
- "container": "TGI 2.2.0",
159
- "status": "not supported",
160
- "tokensPerSecond": "-",
161
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
162
- },
163
  ],
164
  }
 
153
  "tokensPerSecond": "58",
154
  "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
155
  },
 
 
 
 
 
 
 
156
  ],
157
  }
results_arcee_supernova.py CHANGED
@@ -33,17 +33,17 @@ results_arcee_supernova = {
33
  "configurations": [
34
  {
35
  "quantization": "none",
36
- "container": "transformers-neuronx",
37
  "status": "KO",
38
  "tokensPerSecond": "-",
39
- "notes": "OOM bs=2,seqlen=4096",
40
  },
41
  {
42
  "quantization": "none",
43
- "container": "transformers-neuronx",
44
  "status": "KO",
45
  "tokensPerSecond": "-",
46
- "notes": "OOM bs=2,seqlen=2048",
47
  },
48
  ],
49
  },
@@ -52,24 +52,24 @@ results_arcee_supernova = {
52
  "configurations": [
53
  {
54
  "quantization": "none",
55
- "container": "transformers-neuronx",
56
  "status": "OK",
57
  "tokensPerSecond": "28",
58
- "notes": "bs=4,seqlen=4096",
59
  },
60
  {
61
  "quantization": "none",
62
- "container": "transformers-neuronx",
63
  "status": "OK",
64
  "tokensPerSecond": "24",
65
- "notes": "bs=2,seqlen=8192",
66
  },
67
  {
68
  "quantization": "none",
69
- "container": "transformers-neuronx",
70
- "status": "?",
71
- "tokensPerSecond": "KO",
72
- "notes": "OOM bs=2,seqlen=16384",
73
  },
74
  ],
75
  },
@@ -85,8 +85,8 @@ results_arcee_supernova = {
85
  "instanceType": "p5.48xlarge",
86
  "quantization": "none",
87
  "container": "TGI 2.2.0",
88
- "status": "?",
89
- "tokensPerSecond": "?",
90
  "notes": "",
91
  },
92
  ],
 
33
  "configurations": [
34
  {
35
  "quantization": "none",
36
+ "container": "LMI 0.29+transformers-neuronx 0.11.351",
37
  "status": "KO",
38
  "tokensPerSecond": "-",
39
+ "notes": "OOM bs=2,seqlen=4096 - SDK 2.19.1",
40
  },
41
  {
42
  "quantization": "none",
43
+ "container": "LMI 0.29+transformers-neuronx 0.11.351",
44
  "status": "KO",
45
  "tokensPerSecond": "-",
46
+ "notes": "OOM bs=2,seqlen=2048 - SDK 2.19.1",
47
  },
48
  ],
49
  },
 
52
  "configurations": [
53
  {
54
  "quantization": "none",
55
+ "container": "LMI 0.29+transformers-neuronx 0.11.351",
56
  "status": "OK",
57
  "tokensPerSecond": "28",
58
+ "notes": "bs=4,seqlen=4096 - SDK 2.19.1",
59
  },
60
  {
61
  "quantization": "none",
62
+ "container": "LMI 0.29+transformers-neuronx 0.11.351",
63
  "status": "OK",
64
  "tokensPerSecond": "24",
65
+ "notes": "bs=2,seqlen=8192 - SDK 2.19.1",
66
  },
67
  {
68
  "quantization": "none",
69
+ "container": "LMI 0.29+transformers-neuronx 0.11.351",
70
+ "status": "KO",
71
+ "tokensPerSecond": "-",
72
+ "notes": "OOM bs=2,seqlen=16384 - SDK 2.19.1",
73
  },
74
  ],
75
  },
 
85
  "instanceType": "p5.48xlarge",
86
  "quantization": "none",
87
  "container": "TGI 2.2.0",
88
+ "status": "OK",
89
+ "tokensPerSecond": "58",
90
  "notes": "",
91
  },
92
  ],
results_llama_spark.py CHANGED
@@ -83,7 +83,7 @@ results_llama_spark = {
83
  "tokensPerSecond": "123",
84
  },
85
  {
86
- "container": "vLLM 0.5.5",
87
  "quantization": "none",
88
  "status": "OK",
89
  "tokensPerSecond": "106",
@@ -98,20 +98,13 @@ results_llama_spark = {
98
  "tokensPerSecond": "145",
99
  "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
100
  },
101
- {
102
- "instanceType": "inf2.*",
103
- "container": "TGI 2.2.0",
104
- "status": "not supported",
105
- "tokensPerSecond": "-",
106
- "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
107
- },
108
  {
109
  "instanceType": "inf2.2xlarge",
110
- "container": "transformers-neuronx 0.11.351",
111
  "quantization": "none",
112
  "status": "OK",
113
  "tokensPerSecond": "24",
114
- "notes": "Neuron SDK 2.19.1",
115
  },
116
  ],
117
  }
 
83
  "tokensPerSecond": "123",
84
  },
85
  {
86
+ "container": "LMI 0.29+vLLM 0.5.5",
87
  "quantization": "none",
88
  "status": "OK",
89
  "tokensPerSecond": "106",
 
98
  "tokensPerSecond": "145",
99
  "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
100
  },
 
 
 
 
 
 
 
101
  {
102
  "instanceType": "inf2.2xlarge",
103
+ "container": "LMI 0.29+transformers-neuronx 0.11.351",
104
  "quantization": "none",
105
  "status": "OK",
106
  "tokensPerSecond": "24",
107
+ "notes": "SDK 2.19.1",
108
  },
109
  ],
110
  }