Julien Simon committed on
Commit
0c0f086
1 Parent(s): 3fdf87c

Break results into one file per model

Browse files
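After this change, each model's benchmark data lives in its own results_<model>.py module, and results.py merely aggregates the per-model dicts into the same results structure the app already consumes. A minimal sketch of reading that aggregate (it mirrors get_model_names in app.py below; the print call is illustrative only):

# Sketch: consuming the aggregated results dict (structure as defined in results.py below)
from results import results

model_names = sorted(model["name"] for model in results["models"])
print(model_names)  # e.g. ['Arcee-Agent', 'Arcee-Lite', 'Arcee-Meraj', ...]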
.pre-commit-config.yaml CHANGED
@@ -57,13 +57,6 @@ repos:
57
  - id: cfn-lint
58
  files: cloudformation/.*\.(json|yml|yaml)$
59
 
60
- - repo: https://github.com/asottile/pyupgrade
61
- rev: v3.17.0
62
- hooks:
63
- - id: pyupgrade
64
- args: [--py310-plus]
65
- entry: bash -c 'pyupgrade "$@"; git add -u' --
66
-
67
  - repo: https://github.com/pre-commit/mirrors-mypy
68
  rev: v1.11.2
69
  hooks:
 
57
  - id: cfn-lint
58
  files: cloudformation/.*\.(json|yml|yaml)$
59
 
60
  - repo: https://github.com/pre-commit/mirrors-mypy
61
  rev: v1.11.2
62
  hooks:
.pylintrc CHANGED
@@ -1,2 +1,2 @@
1
  [MESSAGES CONTROL]
2
- disable=C0301,E0401,R0914
 
1
  [MESSAGES CONTROL]
2
+ disable=R0801,C0301,E0401,R0914,R1702
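For reference, the newly disabled pylint checks are R0801 (duplicate-code) and R1702 (too-many-nested-blocks), alongside the previously disabled C0301 (line-too-long), E0401 (import-error) and R0914 (too-many-locals); presumably the per-model result files, which share a near-identical dict layout, would otherwise trip duplicate-code, and the nested configuration loop in display_results would trip too-many-nested-blocks.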
app.py CHANGED
@@ -1,3 +1,8 @@
1
  import logging
2
  import re
3
 
@@ -16,7 +21,7 @@ def get_model_names():
16
  Returns:
17
  list: Sorted list of model names.
18
  """
19
- return sorted([model['name'] for model in results['models']])
20
 
21
 
22
  def get_models_by_architecture(model_name):
@@ -29,12 +34,14 @@ def get_models_by_architecture(model_name):
29
  Returns:
30
  list: List of models with the same architecture.
31
  """
32
- selected_model = next((m for m in results['models'] if m['name'] == model_name), None)
 
 
33
  if not selected_model:
34
  return []
35
-
36
- model_type = selected_model.get('modelType', '')
37
- return [m for m in results['models'] if m.get('modelType', '') == model_type]
38
 
39
 
40
  def custom_sort_key(instance_type):
@@ -47,12 +54,24 @@ def custom_sort_key(instance_type):
47
  Returns:
48
  tuple: A tuple used for sorting, containing (family, size_index).
49
  """
50
- size_order = ['xlarge', '2xlarge', '4xlarge', '8xlarge', '12xlarge', '16xlarge', '24xlarge', '48xlarge']
51
-
52
- match = re.match(r'([a-z]+\d+)\.(\w+)', instance_type)
53
  if match:
54
  family, size = match.groups()
55
- return (family, size_order.index(size) if size in size_order else len(size_order))
56
  return (instance_type, 0) # Fallback for non-standard instance types
57
 
58
 
@@ -71,109 +90,122 @@ def display_results(model_name):
71
  try:
72
  models = get_models_by_architecture(model_name)
73
  if not models:
74
- logging.warning(f"No models found for {model_name}")
75
- return f"No results found for the selected model: {model_name}", pd.DataFrame()
76
-
77
- model_type = models[0].get('modelType', 'N/A')
78
- data = {}
79
  merged_models = set()
80
 
81
  for model in models:
82
- merged_models.add(model.get('name', 'Unknown'))
83
- for config in model.get('configurations', []):
84
  try:
85
- cloud = config.get('cloud', 'N/A')
86
- instance_type = config.get('instanceType', 'N/A')
87
- key = (cloud, instance_type)
88
-
89
- if 'configurations' in config:
90
- for nested_config in config['configurations']:
91
- nested_key = key + (nested_config.get('quantization', 'N/A'),)
92
- data[nested_key] = {
93
  "Cloud": cloud,
94
  "Instance Type": instance_type,
95
- "GPU": config.get('gpu', 'N/A'),
96
- "GPU RAM": config.get('gpuRAM', 'N/A'),
97
- "Status": nested_config.get('status', 'N/A'),
98
- "Quantization": nested_config.get('quantization', 'N/A'),
99
- "Container": nested_config.get('container', nested_config.get('tgi', 'N/A')),
100
- "Tokens per Second": nested_config.get('tokensPerSecond', 'N/A'),
101
- "Notes": nested_config.get('notes', ''),
102
  }
103
- else:
104
- # Generate a unique key for each configuration
105
- unique_key = key + (config.get('quantization', 'N/A'), len(data))
106
- data[unique_key] = {
107
- "Cloud": cloud,
108
- "Instance Type": instance_type,
109
- "GPU": config.get('gpu', 'N/A'),
110
- "GPU RAM": config.get('gpuRAM', 'N/A'),
111
- "Status": config.get('status', 'N/A'),
112
- "Quantization": config.get('quantization', 'N/A'),
113
- "Container": config.get('container', config.get('tgi', 'N/A')),
114
- "Tokens per Second": config.get('tokensPerSecond', 'N/A'),
115
- "Notes": config.get('notes', ''),
116
- }
117
- except Exception as e:
118
- print(f"Error processing configuration: {e}")
119
  continue
120
 
121
  if not data:
122
- logging.warning(f"No data extracted for {model_name}")
123
- return f"No data could be extracted for the selected model: {model_name}", pd.DataFrame()
124
-
125
- # Merge data if there are conflicts
126
- for key, value in data.items():
127
- for field in value:
128
- if value[field] == 'N/A':
129
- for other_key, other_value in data.items():
130
- if other_key[0] == key[0] and other_value[field] != 'N/A':
131
- value[field] = other_value[field]
132
- break
133
-
134
- # Filter out rows where Status is 'N/A'
135
- data = {k: v for k, v in data.items() if v['Status'] != 'N/A'}
136
 
137
- merged_models_message = f"Note: Results merged from models: {', '.join(merged_models)}" if len(merged_models) > 1 else None
138
 
139
- # Sort the data by instance type
140
- sorted_data = sorted(data.values(), key=lambda x: custom_sort_key(x['Instance Type']))
141
 
142
- results = f"## Results for {model_name}\n\nModel Type: {model_type}"
143
  if merged_models_message:
144
- results += f"\n\n{merged_models_message}"
145
-
146
  df = pd.DataFrame(sorted_data)
147
-
148
  def color_status(val):
149
- if val == 'OK':
150
- return 'background-color: green; color: white'
151
- elif val == 'KO':
152
- return 'background-color: red; color: white'
153
- else:
154
- return ''
155
 
156
- styled_df = df.style.applymap(color_status, subset=['Status'])
157
-
158
- return results, styled_df
159
 
160
- except Exception as e:
161
- logging.exception(f"Error in display_results: {e}")
162
- return f"An error occurred while processing results for {model_name}: {str(e)}", pd.DataFrame()
163
 
164
  with gr.Blocks() as demo:
165
  gr.Markdown("# Model Benchmark Results")
166
- gr.Markdown("This table shows the benchmark results for each model. Container settings ([TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher), [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.) are default unless noted.")
167
  model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")
168
-
169
  results_text = gr.Markdown()
170
  results_output = gr.DataFrame(label="Results")
171
-
172
  model_dropdown.change(
173
- display_results,
174
- inputs=[model_dropdown],
175
- outputs=[results_text, results_output]
176
  )
177
 
178
- if __name__ == "__main__":
179
- demo.launch()
 
1
+ """
2
+ This module provides functionality for displaying and analyzing model benchmark results.
3
+ It includes functions for data processing, sorting, and a Gradio interface for user interaction.
4
+ """
5
+
6
  import logging
7
  import re
8
 
 
21
  Returns:
22
  list: Sorted list of model names.
23
  """
24
+ return sorted([model["name"] for model in results["models"]])
25
 
26
 
27
  def get_models_by_architecture(model_name):
 
34
  Returns:
35
  list: List of models with the same architecture.
36
  """
37
+ selected_model = next(
38
+ (m for m in results["models"] if m["name"] == model_name), None
39
+ )
40
  if not selected_model:
41
  return []
42
+
43
+ model_type = selected_model.get("modelType", "")
44
+ return [m for m in results["models"] if m.get("modelType", "") == model_type]
45
 
46
 
47
  def custom_sort_key(instance_type):
 
54
  Returns:
55
  tuple: A tuple used for sorting, containing (family, size_index).
56
  """
57
+ size_order = [
58
+ "xlarge",
59
+ "2xlarge",
60
+ "4xlarge",
61
+ "8xlarge",
62
+ "12xlarge",
63
+ "16xlarge",
64
+ "24xlarge",
65
+ "48xlarge",
66
+ ]
67
+
68
+ match = re.match(r"([a-z]+\d+)\.(\w+)", instance_type)
69
  if match:
70
  family, size = match.groups()
71
+ return (
72
+ family,
73
+ size_order.index(size) if size in size_order else len(size_order),
74
+ )
75
  return (instance_type, 0) # Fallback for non-standard instance types
76
 
77
 
 
90
  try:
91
  models = get_models_by_architecture(model_name)
92
  if not models:
93
+ logging.warning("No models found for %s", model_name)
94
+ return (
95
+ f"No results found for the selected model: {model_name}",
96
+ pd.DataFrame(),
97
+ )
98
+
99
+ model_type = models[0].get("modelType", "N/A")
100
+ data = []
101
  merged_models = set()
102
 
103
  for model in models:
104
+ merged_models.add(model.get("name", "Unknown"))
105
+ for config in model.get("configurations", []):
106
  try:
107
+ cloud = config.get("cloud", "N/A")
108
+ instance_type = config.get("instanceType", "N/A")
109
+
110
+ if "configurations" in config:
111
+ for nested_config in config["configurations"]:
112
+ data.append(
113
+ {
114
+ "Cloud": cloud,
115
+ "Instance Type": instance_type,
116
+ "GPU": config.get("gpu", "N/A"),
117
+ "GPU RAM": config.get("gpuRAM", "N/A"),
118
+ "Status": nested_config.get("status", "N/A"),
119
+ "Quantization": nested_config.get(
120
+ "quantization", "N/A"
121
+ ),
122
+ "Container": nested_config.get(
123
+ "container",
124
+ nested_config.get("tgi", "N/A"),
125
+ ),
126
+ "Tokens per Second": nested_config.get(
127
+ "tokensPerSecond", "N/A"
128
+ ),
129
+ "Notes": nested_config.get("notes", ""),
130
+ }
131
+ )
132
+ else:
133
+ data.append(
134
+ {
135
  "Cloud": cloud,
136
  "Instance Type": instance_type,
137
+ "GPU": config.get("gpu", "N/A"),
138
+ "GPU RAM": config.get("gpuRAM", "N/A"),
139
+ "Status": config.get("status", "N/A"),
140
+ "Quantization": config.get("quantization", "N/A"),
141
+ "Container": config.get(
142
+ "container", config.get("tgi", "N/A")
143
+ ),
144
+ "Tokens per Second": config.get(
145
+ "tokensPerSecond", "N/A"
146
+ ),
147
+ "Notes": config.get("notes", ""),
148
  }
149
+ )
150
+ except (KeyError, ValueError, TypeError) as e:
151
+ logging.error("Error processing configuration: %s", e)
152
  continue
153
 
154
  if not data:
155
+ logging.warning("No data extracted for %s", model_name)
156
+ return (
157
+ f"No data for the selected model: {model_name}",
158
+ pd.DataFrame(),
159
+ )
160
 
161
+ merged_models_message = (
162
+ f"Note: Results merged from models: {', '.join(merged_models)}"
163
+ if len(merged_models) > 1
164
+ else None
165
+ )
166
 
167
+ sorted_data = sorted(data, key=lambda x: custom_sort_key(x["Instance Type"]))
 
168
 
169
+ result_text = f"## Results for {model_name}\n\nModel Type: {model_type}"
170
  if merged_models_message:
171
+ result_text += f"\n\n{merged_models_message}"
172
+
173
  df = pd.DataFrame(sorted_data)
174
+
175
  def color_status(val):
176
+ if val == "OK":
177
+ return "background-color: green; color: white"
178
+ if val == "KO":
179
+ return "background-color: red; color: white"
180
+ return ""
181
+
182
+ styled_df = df.style.applymap(color_status, subset=["Status"])
183
+
184
+ return result_text, styled_df
185
 
186
+ except (KeyError, ValueError, TypeError) as e:
187
+ logging.exception("Error in display_results: %s", e)
188
+ return (
189
+ f"An error for {model_name}: {str(e)}",
190
+ pd.DataFrame(),
191
+ )
192
 
193
 
194
  with gr.Blocks() as demo:
195
  gr.Markdown("# Model Benchmark Results")
196
+ gr.Markdown(
197
+ """This table shows the benchmark results for each model. \n
198
+ Container settings ([TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher),
199
+ [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.) are default unless noted."""
200
+ )
201
  model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")
202
+
203
  results_text = gr.Markdown()
204
  results_output = gr.DataFrame(label="Results")
205
+
206
  model_dropdown.change(
207
+ display_results, inputs=[model_dropdown], outputs=[results_text, results_output]
 
 
208
  )
209
 
210
+ if __name__ == "__main__":
211
+ demo.launch()
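The one subtle piece of the new app.py is custom_sort_key, which orders table rows by instance family and then by size. A quick sketch of the ordering it produces (assuming app.py and its dependencies are importable; the instance types come from the result files below):

# Sketch: how custom_sort_key orders instance types (family first, then size)
from app import custom_sort_key

instance_types = ["g5.48xlarge", "g5.2xlarge", "g6.12xlarge", "inf2.*", "g5.xlarge"]
print(sorted(instance_types, key=custom_sort_key))
# ['g5.xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g6.12xlarge', 'inf2.*']
# "inf2.*" does not match the family/size regex, so it falls back to (instance_type, 0)

Note that pandas 2.1 renamed Styler.applymap to Styler.map, so the df.style.applymap call above may emit a deprecation warning on newer pandas versions.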
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  gradio
 
 
1
  gradio
2
+ pandas
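pandas is added as an explicit dependency because app.py builds the results table as a pd.DataFrame and styles it with pandas' Styler before handing it to the Gradio DataFrame component; previously it was presumably only available as a transitive dependency.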
results.py CHANGED
@@ -1,699 +1,43 @@
1
  """Module containing model configuration results for various AI models and hardware setups."""
2
 
3
  results = {
4
  "models": [
5
- {
6
- "name": "Arcee-Meraj",
7
- "modelType": "Qwen2 72B",
8
- "configurations": [
9
- {
10
- "region": "us-west-2",
11
- "instanceType": "g5.12xlarge",
12
- "cloud": "AWS",
13
- "gpu": "4xNVIDIA A10G",
14
- "gpuRAM": "96 GB",
15
- "quantization": "awq",
16
- "container": "TGI 2.2.0",
17
- "status": "OK",
18
- "tokensPerSecond": "33",
19
- "notes": "",
20
- },
21
- {
22
- "region": "us-west-2",
23
- "instanceType": "p4d.24xlarge",
24
- "cloud": "AWS",
25
- "gpu": "4xNVIDIA A100",
26
- "gpuRAM": "320 GB",
27
- "quantization": "none",
28
- "container": "TGI 2.2.0",
29
- "status": "OK",
30
- "tokensPerSecond": "38",
31
- "notes": "",
32
- },
33
- ],
34
- },
35
- {
36
- "name": "Arcee-SuperNova",
37
- "modelType": "Llama 3.1 70B",
38
- "configurations": [
39
- {
40
- "region": "us-west-2",
41
- "instanceType": "g5.12xlarge",
42
- "cloud": "AWS",
43
- "gpu": "4xNVIDIA A10G",
44
- "gpuRAM": "96 GB",
45
- "quantization": "awq",
46
- "container": "TGI 2.2.0",
47
- "status": "OK",
48
- "tokensPerSecond": "33",
49
- "notes": "",
50
- },
51
- {
52
- "region": "us-west-2",
53
- "instanceType": "p4d.24xlarge",
54
- "cloud": "AWS",
55
- "gpu": "4xNVIDIA A100",
56
- "gpuRAM": "320 GB",
57
- "quantization": "none",
58
- "container": "TGI 2.2.0",
59
- "status": "OK",
60
- "tokensPerSecond": "38",
61
- "notes": "",
62
- },
63
- ],
64
- },
65
- {
66
- "name": "Arcee-Nova",
67
- "modelType": "Qwen2 72B",
68
- "notes": "",
69
- "configurations": [
70
- {
71
- "region": "us-west-2",
72
- "instanceType": "g4dn.12xlarge",
73
- "cloud": "AWS",
74
- "gpu": "4xNVIDIA T4",
75
- "gpuRAM": "64 GB",
76
- "quantization": "bitsandbytes-nf4",
77
- "container": "TGI 2.2.0",
78
- "status": "KO",
79
- "tokensPerSecond": "-",
80
- "notes": "Flash Attention requires Ampere GPUs or newer",
81
- },
82
- {
83
- "region": "us-west-2",
84
- "instanceType": "g5.12xlarge",
85
- "cloud": "AWS",
86
- "gpu": "4xNVIDIA A10G",
87
- "gpuRAM": "96 GB",
88
- "configurations": [
89
- {
90
- "quantization": "bitsandbytes-nf4",
91
- "container": "TGI 2.2.0",
92
- "status": "OK",
93
- "tokensPerSecond": "12",
94
- },
95
- {
96
- "quantization": "bitsandbytes-fp4",
97
- "container": "TGI 2.2.0",
98
- "status": "OK",
99
- "tokensPerSecond": "12",
100
- },
101
- {
102
- "quantization": "bitsandbytes (int8)",
103
- "container": "TGI 2.2.0",
104
- "status": "KO",
105
- "tokensPerSecond": "-",
106
- "notes": "CUDA OOM",
107
- },
108
- {
109
- "quantization": "eetq (int8)",
110
- "container": "TGI 2.2.0",
111
- "status": "KO",
112
- "tokensPerSecond": "-",
113
- "notes": "[FT Error] Heurisitc failed to find a valid config.",
114
- },
115
- ],
116
- },
117
- {
118
- "region": "us-west-2",
119
- "instanceType": "g5.48xlarge",
120
- "cloud": "AWS",
121
- "gpu": "8xNVIDIA A10G",
122
- "gpuRAM": "192 GB",
123
- "configurations": [
124
- {
125
- "quantization": "none",
126
- "container": "TGI 2.2.0",
127
- "status": "KO",
128
- "tokensPerSecond": "-",
129
- "notes": "CUDA OOM (but g6.48xlarge works!)",
130
- },
131
- {
132
- "quantization": "bitsandbytes-nf4",
133
- "container": "TGI 2.2.0",
134
- "status": "OK",
135
- "tokensPerSecond": "12.3",
136
- },
137
- {
138
- "quantization": "bitsandbytes-fp4",
139
- "container": "TGI 2.2.0",
140
- "status": "OK",
141
- "tokensPerSecond": "12.5",
142
- },
143
- {
144
- "quantization": "bitsandbytes (int8)",
145
- "container": "TGI 2.2.0",
146
- "status": "KO",
147
- "tokensPerSecond": "-",
148
- "notes": "The model deploys, but inference times out.",
149
- },
150
- ],
151
- },
152
- {
153
- "region": "us-west-2",
154
- "instanceType": "g6.12xlarge",
155
- "cloud": "AWS",
156
- "gpu": "4xNVIDIA L4",
157
- "gpuRAM": "96 GB",
158
- "configurations": [
159
- {
160
- "quantization": "bitsandbytes-nf4",
161
- "container": "TGI 2.2.0",
162
- "status": "OK",
163
- "tokensPerSecond": "1.5-2",
164
- "notes": "Too slow, timeouts are likely",
165
- },
166
- {
167
- "quantization": "bitsandbytes-fp4",
168
- "container": "TGI 2.2.0",
169
- "status": "OK",
170
- "tokensPerSecond": "2",
171
- "notes": "Too slow, timeouts are likely",
172
- },
173
- {
174
- "quantization": "bitsandbytes (int8)",
175
- "container": "TGI 2.2.0",
176
- "status": "KO",
177
- "tokensPerSecond": "-",
178
- "notes": "CUDA OOM",
179
- },
180
- ],
181
- },
182
- {
183
- "region": "us-west-2",
184
- "instanceType": "g6.48xlarge",
185
- "cloud": "AWS",
186
- "gpu": "8xNVIDIA L4",
187
- "gpuRAM": "192 GB",
188
- "quantization": "none",
189
- "container": "TGI 2.2.0",
190
- "status": "OK",
191
- "tokensPerSecond": "12",
192
- },
193
- {
194
- "region": "us-west-2",
195
- "instanceType": "p4d.24xlarge",
196
- "cloud": "AWS",
197
- "gpu": "8xNVIDIA A100",
198
- "gpuRAM": "320 GB",
199
- "quantization": "none",
200
- "container": "TGI 2.2.0",
201
- "status": "OK",
202
- "tokensPerSecond": "40",
203
- "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
204
- },
205
- {
206
- "region": "us-west-2",
207
- "instanceType": "p4de.24xlarge",
208
- "cloud": "AWS",
209
- "gpu": "8xNVIDIA A100",
210
- "gpuRAM": "320 GB",
211
- "quantization": "none",
212
- "container": "TGI 2.2.0",
213
- "status": "waiting for quota",
214
- },
215
- {
216
- "region": "us-west-2",
217
- "instanceType": "p5.48xlarge",
218
- "cloud": "AWS",
219
- "gpu": "8xNVIDIA H100",
220
- "gpuRAM": "640GB",
221
- "quantization": "none",
222
- "container": "TGI 2.2.0",
223
- "status": "OK",
224
- "tokensPerSecond": "58",
225
- "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
226
- },
227
- {
228
- "region": "us-west-2",
229
- "instanceType": "inf2.*",
230
- "cloud": "AWS",
231
- "gpu": "-",
232
- "container": "TGI 2.2.0",
233
- "status": "not supported",
234
- "tokensPerSecond": "-",
235
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
236
- },
237
- ],
238
- },
239
- {
240
- "name": "Llama-Spark",
241
- "modelType": "Llama 3.1 8B",
242
- "configurations": [
243
- {
244
- "region": "AWS",
245
- "instanceType": "g5.2xlarge",
246
- "cloud": "AWS",
247
- "gpu": "1xNVIDIA A10G",
248
- "gpuRAM": "24 GB",
249
- "quantization": "none",
250
- "container": "TGI 2.2.0",
251
- "status": "OK",
252
- "tokensPerSecond": "29",
253
- "notes": "4K/8K fails",
254
- },
255
- {
256
- "region": "AWS",
257
- "instanceType": "g5.12xlarge",
258
- "cloud": "AWS",
259
- "gpu": "4xNVIDIA A10G",
260
- "gpuRAM": "96 GB",
261
- "quantization": "none",
262
- "container": "TGI 2.2.0",
263
- "status": "OK",
264
- "tokensPerSecond": "85",
265
- "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
266
- },
267
- {
268
- "region": "AWS",
269
- "instanceType": "g5.48xlarge",
270
- "cloud": "AWS",
271
- "gpu": "8xNVIDIA A10G",
272
- "gpuRAM": "192 GB",
273
- "quantization": "none",
274
- "container": "TGI 2.2.0",
275
- "status": "OK",
276
- "tokensPerSecond": "105",
277
- "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
278
- },
279
- {
280
- "region": "AWS",
281
- "instanceType": "g6.2xlarge",
282
- "cloud": "AWS",
283
- "gpu": "1xNVIDIA L4",
284
- "gpuRAM": "24 GB",
285
- "configurations": [
286
- {
287
- "quantization": "none",
288
- "container": "TGI 2.2.0",
289
- "status": "OK",
290
- "tokensPerSecond": "15",
291
- },
292
- {"quantization": "fp8", "container": "TGI 2.2.0"},
293
- ],
294
- },
295
- {
296
- "region": "AWS",
297
- "instanceType": "g6.12xlarge",
298
- "cloud": "AWS",
299
- "gpu": "4xNVIDIA L4",
300
- "gpuRAM": "96 GB",
301
- "quantization": "none",
302
- "container": "TGI 2.2.0",
303
- "status": "OK",
304
- "tokensPerSecond": "51",
305
- "notes": "same as g5?",
306
- },
307
- {
308
- "region": "AWS",
309
- "instanceType": "g6.48xlarge",
310
- "cloud": "AWS",
311
- "gpu": "8xNVIDIA L4",
312
- "gpuRAM": "192 GB",
313
- "quantization": "none",
314
- "container": "TGI 2.2.0",
315
- "status": "OK",
316
- "tokensPerSecond": "81",
317
- "notes": "same as g5?",
318
- },
319
- {
320
- "region": "AWS",
321
- "instanceType": "g6e.2xlarge",
322
- "cloud": "AWS",
323
- "gpu": "1xNVIDIA L40S",
324
- "gpuRAM": "48 GB",
325
- "quantization": "none",
326
- "container": "TGI 2.2.0",
327
- "status": "OK",
328
- "tokensPerSecond": "42.1",
329
- },
330
- {
331
- "region": "AWS",
332
- "instanceType": "g6e.2xlarge",
333
- "cloud": "AWS",
334
- "gpu": "1xNVIDIA L40S",
335
- "gpuRAM": "48 GB",
336
- "quantization": "none",
337
- "container": "SGLang 0.2.13",
338
- "status": "OK",
339
- "tokensPerSecond": "45",
340
- },
341
- {
342
- "region": "AWS",
343
- "instanceType": "g6e.2xlarge",
344
- "cloud": "AWS",
345
- "gpu": "1xNVIDIA L40S",
346
- "gpuRAM": "48 GB",
347
- "quantization": "none",
348
- "container": "vLLM 0.5.5",
349
- "status": "OK",
350
- "tokensPerSecond": "43.4",
351
- },
352
- {
353
- "region": "AWS",
354
- "instanceType": "p4d.24xlarge",
355
- "cloud": "AWS",
356
- "gpu": "4xNVIDIA A100",
357
- "gpuRAM": "320 GB",
358
- "quantization": "none",
359
- "container": "TGI 2.2.0",
360
- "status": "OK",
361
- "tokensPerSecond": "145",
362
- "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
363
- },
364
- {
365
- "region": "AWS",
366
- "instanceType": "inf2.*",
367
- "cloud": "AWS",
368
- "gpu": "-",
369
- "status": "not supported",
370
- "tokensPerSecond": "-",
371
- "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
372
- },
373
- ],
374
- },
375
- {
376
- "name": "Arcee-Agent",
377
- "modelType": "Qwen2 7B",
378
- "notes": "",
379
- "configurations": [
380
- {
381
- "region": "us-west-2",
382
- "instanceType": "g5.2xlarge",
383
- "cloud": "AWS",
384
- "gpu": "1xNVIDIA A10G",
385
- "gpuRAM": "24 GB",
386
- "quantization": "none",
387
- "container": "TGI 2.2.0",
388
- "status": "OK",
389
- "tokensPerSecond": "30",
390
- },
391
- {
392
- "region": "us-west-2",
393
- "instanceType": "g5.12xlarge",
394
- "cloud": "AWS",
395
- "gpu": "4xNVIDIA A10G",
396
- "gpuRAM": "96 GB",
397
- "quantization": "none",
398
- "container": "TGI 2.2.0",
399
- "status": "OK",
400
- "tokensPerSecond": "83",
401
- },
402
- {
403
- "region": "us-west-2",
404
- "instanceType": "g5.48xlarge",
405
- "cloud": "AWS",
406
- "gpu": "8xNVIDIA A10G",
407
- "gpuRAM": "192 GB",
408
- "quantization": "none",
409
- "container": "TGI 2.2.0",
410
- "status": "KO",
411
- "tokensPerSecond": "-",
412
- "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
413
- },
414
- {
415
- "region": "us-west-2",
416
- "instanceType": "g6.2xlarge",
417
- "cloud": "AWS",
418
- "gpu": "1xNVIDIA L4",
419
- "gpuRAM": "24 GB",
420
- "quantization": "none",
421
- "container": "TGI 2.2.0",
422
- "status": "OK",
423
- "tokensPerSecond": "16.3",
424
- },
425
- {
426
- "region": "us-west-2",
427
- "instanceType": "g6.12xlarge",
428
- "cloud": "AWS",
429
- "gpu": "4xNVIDIA L4",
430
- "gpuRAM": "96 GB",
431
- "quantization": "none",
432
- "container": "TGI 2.2.0",
433
- "status": "OK",
434
- "tokensPerSecond": "54.2",
435
- },
436
- {
437
- "region": "us-west-2",
438
- "instanceType": "inf2.*",
439
- "cloud": "AWS",
440
- "gpu": "-",
441
- "container": "TGI 2.2.0",
442
- "status": "not supported",
443
- "tokensPerSecond": "-",
444
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
445
- },
446
- {
447
- "region": "us-west-2",
448
- "instanceType": "g6e.2xlarge",
449
- "cloud": "AWS",
450
- "gpu": "1xNVIDIA L40S",
451
- "gpuRAM": "48 GB",
452
- "quantization": "none",
453
- "container": "TGI 2.2.0",
454
- "status": "OK",
455
- "tokensPerSecond": "45",
456
- },
457
- {
458
- "region": "us-west-2",
459
- "instanceType": "g6e.2xlarge",
460
- "cloud": "AWS",
461
- "gpu": "1xNVIDIA L40S",
462
- "gpuRAM": "48 GB",
463
- "quantization": "none",
464
- "container": "SGLang 0.2.13",
465
- "status": "OK",
466
- "tokensPerSecond": "48",
467
- },
468
- {
469
- "region": "us-west-2",
470
- "instanceType": "g6e.2xlarge",
471
- "cloud": "AWS",
472
- "gpu": "1xNVIDIA L40S",
473
- "gpuRAM": "48 GB",
474
- "quantization": "none",
475
- "container": "vLLM 0.5.5",
476
- "status": "OK",
477
- "tokensPerSecond": "45.7",
478
- },
479
- ],
480
- },
481
- {"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
482
- {
483
- "name": "Arcee-Lite",
484
- "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
485
- "configurations": [
486
- {
487
- "region": "us-west-2",
488
- "instanceType": "c6i.xlarge",
489
- "cloud": "AWS",
490
- "gpu": "-",
491
- "gpuRAM": "-",
492
- "quantization": "bitsandbytes-nf4",
493
- "container": "TGI 2.2.0",
494
- "status": "KO",
495
- "tokensPerSecond": "-",
496
- "notes": "OOM, might work with a prequantized model",
497
- },
498
- {
499
- "region": "us-west-2",
500
- "instanceType": "c6i.2xlarge",
501
- "cloud": "AWS",
502
- "gpu": "-",
503
- "gpuRAM": "-",
504
- "quantization": "bitsandbytes-nf4",
505
- "container": "TGI 2.2.0",
506
- "status": "KO",
507
- "tokensPerSecond": "-",
508
- "notes": "OOM, might work with a prequantized model",
509
- },
510
- {
511
- "region": "us-west-2",
512
- "instanceType": "c6i.4xlarge",
513
- "cloud": "AWS",
514
- "gpu": "-",
515
- "gpuRAM": "-",
516
- "configurations": [
517
- {
518
- "quantization": "none",
519
- "container": "TGI 2.2.0",
520
- "status": "OK",
521
- "tokensPerSecond": "10.7",
522
- },
523
- {
524
- "quantization": "bitsandbytes (int8)",
525
- "container": "TGI 2.2.0",
526
- "status": "OK",
527
- "tokensPerSecond": "10.5",
528
- },
529
- {
530
- "quantization": "bitsandbytes-nf4",
531
- "container": "TGI 2.2.0",
532
- "status": "OK",
533
- "tokensPerSecond": "10.6",
534
- },
535
- ],
536
- },
537
- {
538
- "region": "us-west-2",
539
- "instanceType": "c7i.4xlarge",
540
- "cloud": "AWS",
541
- "gpu": "-",
542
- "gpuRAM": "-",
543
- "quantization": "none",
544
- "container": "TGI 2.2.0",
545
- "status": "waiting for quota",
546
- "tokensPerSecond": "-",
547
- },
548
- {
549
- "region": "us-west-2",
550
- "instanceType": "g5.xlarge",
551
- "cloud": "AWS",
552
- "gpu": "1xNVIDIA A10G",
553
- "gpuRAM": "24 GB",
554
- "configurations": [
555
- {
556
- "quantization": "none",
557
- "container": "TGI 2.2.0",
558
- "status": "OK",
559
- "tokensPerSecond": "110",
560
- },
561
- {
562
- "quantization": "none",
563
- "container": "DJL 0.28 vLLM",
564
- "status": "OK",
565
- "tokensPerSecond": "105",
566
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
567
- },
568
- ],
569
- },
570
- {
571
- "region": "us-west-2",
572
- "instanceType": "g6e.2xlarge",
573
- "cloud": "AWS",
574
- "gpu": "1xNVIDIA L40S",
575
- "gpuRAM": "48 GB",
576
- "quantization": "none",
577
- "container": "TGI 2.2.0",
578
- "status": "OK",
579
- "tokensPerSecond": "160",
580
- },
581
- {
582
- "region": "us-west-2",
583
- "instanceType": "g6e.2xlarge",
584
- "cloud": "AWS",
585
- "gpu": "1xNVIDIA L40S",
586
- "gpuRAM": "48 GB",
587
- "quantization": "none",
588
- "container": "vLLM 0.5.5",
589
- "status": "OK",
590
- "tokensPerSecond": "146",
591
- },
592
- {
593
- "region": "us-west-2",
594
- "instanceType": "g6e.2xlarge",
595
- "cloud": "AWS",
596
- "gpu": "1xNVIDIA L40S",
597
- "gpuRAM": "48 GB",
598
- "quantization": "none",
599
- "container": "SGLang 0.2.13",
600
- "status": "OK",
601
- "tokensPerSecond": "167",
602
- },
603
- ],
604
- },
605
- {
606
- "name": "Arcee-Scribe",
607
- "modelType": "InternLM2.5 8B",
608
- "configurations": [
609
- {
610
- "cloud": "AWS",
611
- "instanceType": "g5.2xlarge",
612
- "gpu": "1xNVIDIA A10G",
613
- "gpuRAM": "24 GB",
614
- "quantization": "none",
615
- "container": "DJL 0.28 vLLM",
616
- "status": "OK",
617
- "tokensPerSecond": 29,
618
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
619
- },
620
- {
621
- "cloud": "AWS",
622
- "instanceType": "g5.12xlarge",
623
- "gpu": "4xNVIDIA A10G",
624
- "gpuRAM": "96 GB",
625
- "quantization": "none",
626
- "container": "DJL 0.28 vLLM",
627
- "status": "OK",
628
- "tokensPerSecond": 65,
629
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
630
- },
631
- {
632
- "cloud": "AWS",
633
- "instanceType": "g5.48xlarge",
634
- "gpu": "8xNVIDIA A10G",
635
- "gpuRAM": "192 GB",
636
- "quantization": "none",
637
- "container": "DJL 0.28 vLLM",
638
- "status": "OK",
639
- "tokensPerSecond": 80,
640
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
641
- },
642
- {
643
- "cloud": "AWS",
644
- "instanceType": "g6.2xlarge",
645
- "gpu": "1xNVIDIA L4",
646
- "gpuRAM": "24 GB",
647
- "quantization": "none",
648
- "container": "DJL 0.28 vLLM",
649
- "status": "OK",
650
- "tokensPerSecond": 16,
651
- "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
652
- },
653
- {
654
- "cloud": "AWS",
655
- "instanceType": "g6.12xlarge",
656
- "gpu": "4xNVIDIA L4",
657
- "gpuRAM": "96 GB",
658
- "quantization": "none",
659
- "container": "DJL 0.28 vLLM",
660
- "status": "OK",
661
- "tokensPerSecond": 50,
662
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
663
- },
664
- {
665
- "cloud": "AWS",
666
- "instanceType": "g6.48xlarge",
667
- "gpu": "8xNVIDIA L4",
668
- "gpuRAM": "192 GB",
669
- "quantization": "none",
670
- "container": "DJL 0.28 vLLM",
671
- "status": "OK",
672
- "tokensPerSecond": 69,
673
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
674
- },
675
- {
676
- "cloud": "AWS",
677
- "instanceType": "g6e.2xlarge",
678
- "gpu": "1xNVIDIA L40S",
679
- "gpuRAM": "48 GB",
680
- "quantization": "none",
681
- "container": "SGLang 0.2.13",
682
- "status": "OK",
683
- "tokensPerSecond": 46,
684
- },
685
- {
686
- "cloud": "AWS",
687
- "instanceType": "p4d.24xlarge",
688
- "gpu": "4xNVIDIA A100",
689
- "gpuRAM": "320 GB",
690
- "quantization": "none",
691
- "container": "DJL 0.28 vLLM",
692
- "status": "OK",
693
- "tokensPerSecond": 82,
694
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
695
- },
696
- ],
697
- },
698
  ]
699
  }
 
1
  """Module containing model configuration results for various AI models and hardware setups."""
2
 
3
+ from results_arcee_agent import results_arcee_agent
4
+ from results_arcee_lite import results_arcee_lite
5
+ from results_arcee_meraj import results_arcee_meraj
6
+ from results_arcee_nova import results_arcee_nova
7
+ from results_arcee_scribe import results_arcee_scribe
8
+ from results_arcee_spark import results_arcee_spark
9
+ from results_arcee_supernova import results_arcee_supernova
10
+ from results_llama_spark import results_llama_spark
11
+
12
+ instance_type_mappings = {
13
+ "g5.xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA A10G", "gpuRAM": "24 GB"},
14
+ "g5.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA A10G", "gpuRAM": "24 GB"},
15
+ "g5.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA A10G", "gpuRAM": "96 GB"},
16
+ "g5.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA A10G", "gpuRAM": "192 GB"},
17
+ "g6.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA L4", "gpuRAM": "24 GB"},
18
+ "g6.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA L4", "gpuRAM": "96 GB"},
19
+ "g6.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA L4", "gpuRAM": "192 GB"},
20
+ "g6e.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA L40S", "gpuRAM": "48 GB"},
21
+ "g4dn.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA T4", "gpuRAM": "64 GB"},
22
+ "p4d.24xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA A100", "gpuRAM": "320 GB"},
23
+ "p4de.24xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA A100", "gpuRAM": "320 GB"},
24
+ "p5.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA H100", "gpuRAM": "640GB"},
25
+ "c6i.xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
26
+ "c6i.2xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
27
+ "c6i.4xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
28
+ "c7i.4xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
29
+ "inf2.*": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
30
+ }
31
+
32
  results = {
33
  "models": [
34
+ results_arcee_meraj,
35
+ results_arcee_supernova,
36
+ results_arcee_nova,
37
+ results_llama_spark,
38
+ results_arcee_agent,
39
+ results_arcee_spark,
40
+ results_arcee_lite,
41
+ results_arcee_scribe,
 
42
  ]
43
  }
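The per-model files below no longer repeat cloud, gpu and gpuRAM on every configuration; instance_type_mappings above presumably supplies those fields from the instance type when results are displayed. A minimal sketch of how such a lookup could be applied (the helper name and merge order are assumptions, not part of this commit):

# Hypothetical helper: fill in hardware details from instance_type_mappings
def enrich_configuration(config: dict) -> dict:
    mapping = instance_type_mappings.get(config.get("instanceType", ""), {})
    return {**mapping, **config}  # values set on the config itself take precedence

# enrich_configuration({"instanceType": "g5.12xlarge", "quantization": "awq"})
# -> {'cloud': 'AWS', 'gpu': '4xNVIDIA A10G', 'gpuRAM': '96 GB',
#     'instanceType': 'g5.12xlarge', 'quantization': 'awq'}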
results_arcee_agent.py ADDED
@@ -0,0 +1,75 @@
1
+ """Module containing performance results for the Arcee-Agent model."""
2
+
3
+ results_arcee_agent = {
4
+ "name": "Arcee-Agent",
5
+ "modelType": "Qwen2 7B",
6
+ "notes": "",
7
+ "configurations": [
8
+ {
9
+ "instanceType": "g5.2xlarge",
10
+ "quantization": "none",
11
+ "container": "TGI 2.2.0",
12
+ "status": "OK",
13
+ "tokensPerSecond": "30",
14
+ },
15
+ {
16
+ "instanceType": "g5.12xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "83",
21
+ },
22
+ {
23
+ "instanceType": "g5.48xlarge",
24
+ "quantization": "none",
25
+ "container": "TGI 2.2.0",
26
+ "status": "KO",
27
+ "tokensPerSecond": "-",
28
+ "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
29
+ },
30
+ {
31
+ "instanceType": "g6.2xlarge",
32
+ "quantization": "none",
33
+ "container": "TGI 2.2.0",
34
+ "status": "OK",
35
+ "tokensPerSecond": "16.3",
36
+ },
37
+ {
38
+ "instanceType": "g6.12xlarge",
39
+ "quantization": "none",
40
+ "container": "TGI 2.2.0",
41
+ "status": "OK",
42
+ "tokensPerSecond": "54.2",
43
+ },
44
+ {
45
+ "instanceType": "inf2.*",
46
+ "container": "TGI 2.2.0",
47
+ "status": "not supported",
48
+ "tokensPerSecond": "-",
49
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
50
+ },
51
+ {
52
+ "instanceType": "g6e.2xlarge",
53
+ "configurations": [
54
+ {
55
+ "container": "TGI 2.2.0",
56
+ "quantization": "none",
57
+ "status": "OK",
58
+ "tokensPerSecond": "45",
59
+ },
60
+ {
61
+ "container": "SGLang 0.2.13",
62
+ "quantization": "none",
63
+ "status": "OK",
64
+ "tokensPerSecond": "48",
65
+ },
66
+ {
67
+ "container": "vLLM 0.5.5",
68
+ "quantization": "none",
69
+ "status": "OK",
70
+ "tokensPerSecond": "45.7",
71
+ },
72
+ ],
73
+ },
74
+ ],
75
+ }
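Entries such as the g6e.2xlarge block above use a nested configurations list to record several serving containers on the same instance; display_results in app.py flattens these into one row per container, copying the instance-level fields down. A short sketch of that flattening on this file's data:

# Sketch: flattening the nested g6e.2xlarge entry into per-container rows
from results_arcee_agent import results_arcee_agent

entry = results_arcee_agent["configurations"][-1]  # the g6e.2xlarge block
rows = [{"instanceType": entry["instanceType"], **nested} for nested in entry["configurations"]]
# -> three rows: TGI 2.2.0, SGLang 0.2.13 and vLLM 0.5.5, each with its own tokensPerSecond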
results_arcee_lite.py ADDED
@@ -0,0 +1,95 @@
1
+ """Module containing performance results for the Arcee-Lite model."""
2
+
3
+ results_arcee_lite = {
4
+ "name": "Arcee-Lite",
5
+ "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "c6i.xlarge",
9
+ "quantization": "bitsandbytes-nf4",
10
+ "container": "TGI 2.2.0",
11
+ "status": "KO",
12
+ "tokensPerSecond": "-",
13
+ "notes": "OOM, might work with a prequantized model",
14
+ },
15
+ {
16
+ "instanceType": "c6i.2xlarge",
17
+ "quantization": "bitsandbytes-nf4",
18
+ "container": "TGI 2.2.0",
19
+ "status": "KO",
20
+ "tokensPerSecond": "-",
21
+ "notes": "OOM, might work with a prequantized model",
22
+ },
23
+ {
24
+ "instanceType": "c6i.4xlarge",
25
+ "configurations": [
26
+ {
27
+ "quantization": "none",
28
+ "container": "TGI 2.2.0",
29
+ "status": "OK",
30
+ "tokensPerSecond": "10.7",
31
+ },
32
+ {
33
+ "quantization": "bitsandbytes (int8)",
34
+ "container": "TGI 2.2.0",
35
+ "status": "OK",
36
+ "tokensPerSecond": "10.5",
37
+ },
38
+ {
39
+ "quantization": "bitsandbytes-nf4",
40
+ "container": "TGI 2.2.0",
41
+ "status": "OK",
42
+ "tokensPerSecond": "10.6",
43
+ },
44
+ ],
45
+ },
46
+ {
47
+ "instanceType": "c7i.4xlarge",
48
+ "quantization": "none",
49
+ "container": "TGI 2.2.0",
50
+ "status": "waiting for quota",
51
+ "tokensPerSecond": "-",
52
+ },
53
+ {
54
+ "instanceType": "g5.xlarge",
55
+ "configurations": [
56
+ {
57
+ "quantization": "none",
58
+ "container": "TGI 2.2.0",
59
+ "status": "OK",
60
+ "tokensPerSecond": "110",
61
+ },
62
+ {
63
+ "quantization": "none",
64
+ "container": "DJL 0.28 vLLM",
65
+ "status": "OK",
66
+ "tokensPerSecond": "105",
67
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
68
+ },
69
+ ],
70
+ },
71
+ {
72
+ "instanceType": "g6e.2xlarge",
73
+ "configurations": [
74
+ {
75
+ "container": "TGI 2.2.0",
76
+ "quantization": "none",
77
+ "status": "OK",
78
+ "tokensPerSecond": "160",
79
+ },
80
+ {
81
+ "container": "SGLang 0.2.13",
82
+ "quantization": "none",
83
+ "status": "OK",
84
+ "tokensPerSecond": "167",
85
+ },
86
+ {
87
+ "container": "vLLM 0.5.5",
88
+ "quantization": "none",
89
+ "status": "OK",
90
+ "tokensPerSecond": "150",
91
+ },
92
+ ],
93
+ },
94
+ ],
95
+ }
results_arcee_meraj.py ADDED
@@ -0,0 +1,24 @@
1
+ """Module containing performance results for the Arcee-Meraj model."""
2
+
3
+ results_arcee_meraj = {
4
+ "name": "Arcee-Meraj",
5
+ "modelType": "Qwen2 72B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.12xlarge",
9
+ "quantization": "awq",
10
+ "container": "TGI 2.2.0",
11
+ "status": "OK",
12
+ "tokensPerSecond": "33",
13
+ "notes": "",
14
+ },
15
+ {
16
+ "instanceType": "p4d.24xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "38",
21
+ "notes": "",
22
+ },
23
+ ],
24
+ }
results_arcee_nova.py ADDED
@@ -0,0 +1,141 @@
1
+ """Module containing performance results for the Arcee-Nova model."""
2
+
3
+ results_arcee_nova = {
4
+ "name": "Arcee-Nova",
5
+ "modelType": "Qwen2 72B",
6
+ "notes": "",
7
+ "configurations": [
8
+ {
9
+ "instanceType": "g4dn.12xlarge",
10
+ "quantization": "bitsandbytes-nf4",
11
+ "container": "TGI 2.2.0",
12
+ "status": "KO",
13
+ "tokensPerSecond": "-",
14
+ "notes": "Flash Attention requires Ampere GPUs or newer",
15
+ },
16
+ {
17
+ "instanceType": "g5.12xlarge",
18
+ "configurations": [
19
+ {
20
+ "quantization": "bitsandbytes-nf4",
21
+ "container": "TGI 2.2.0",
22
+ "status": "OK",
23
+ "tokensPerSecond": "12",
24
+ },
25
+ {
26
+ "quantization": "bitsandbytes-fp4",
27
+ "container": "TGI 2.2.0",
28
+ "status": "OK",
29
+ "tokensPerSecond": "12",
30
+ },
31
+ {
32
+ "quantization": "bitsandbytes (int8)",
33
+ "container": "TGI 2.2.0",
34
+ "status": "KO",
35
+ "tokensPerSecond": "-",
36
+ "notes": "CUDA OOM",
37
+ },
38
+ {
39
+ "quantization": "eetq (int8)",
40
+ "container": "TGI 2.2.0",
41
+ "status": "KO",
42
+ "tokensPerSecond": "-",
43
+ "notes": "[FT Error] Heurisitc failed to find a valid config.",
44
+ },
45
+ ],
46
+ },
47
+ {
48
+ "instanceType": "g5.48xlarge",
49
+ "configurations": [
50
+ {
51
+ "quantization": "none",
52
+ "container": "TGI 2.2.0",
53
+ "status": "KO",
54
+ "tokensPerSecond": "-",
55
+ "notes": "CUDA OOM (but g6.48xlarge works!)",
56
+ },
57
+ {
58
+ "quantization": "bitsandbytes-nf4",
59
+ "container": "TGI 2.2.0",
60
+ "status": "OK",
61
+ "tokensPerSecond": "12.3",
62
+ },
63
+ {
64
+ "quantization": "bitsandbytes-fp4",
65
+ "container": "TGI 2.2.0",
66
+ "status": "OK",
67
+ "tokensPerSecond": "12.5",
68
+ },
69
+ {
70
+ "quantization": "bitsandbytes (int8)",
71
+ "container": "TGI 2.2.0",
72
+ "status": "KO",
73
+ "tokensPerSecond": "-",
74
+ "notes": "The model deploys, but inference times out.",
75
+ },
76
+ ],
77
+ },
78
+ {
79
+ "instanceType": "g6.12xlarge",
80
+ "configurations": [
81
+ {
82
+ "quantization": "bitsandbytes-nf4",
83
+ "container": "TGI 2.2.0",
84
+ "status": "OK",
85
+ "tokensPerSecond": "1.5-2",
86
+ "notes": "Too slow, timeouts are likely",
87
+ },
88
+ {
89
+ "quantization": "bitsandbytes-fp4",
90
+ "container": "TGI 2.2.0",
91
+ "status": "OK",
92
+ "tokensPerSecond": "2",
93
+ "notes": "Too slow, timeouts are likely",
94
+ },
95
+ {
96
+ "quantization": "bitsandbytes (int8)",
97
+ "container": "TGI 2.2.0",
98
+ "status": "KO",
99
+ "tokensPerSecond": "-",
100
+ "notes": "CUDA OOM",
101
+ },
102
+ ],
103
+ },
104
+ {
105
+ "instanceType": "g6.48xlarge",
106
+ "quantization": "none",
107
+ "container": "TGI 2.2.0",
108
+ "status": "OK",
109
+ "tokensPerSecond": "12",
110
+ },
111
+ {
112
+ "instanceType": "p4d.24xlarge",
113
+ "quantization": "none",
114
+ "container": "TGI 2.2.0",
115
+ "status": "OK",
116
+ "tokensPerSecond": "40",
117
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
118
+ },
119
+ {
120
+ "instanceType": "p4de.24xlarge",
121
+ "quantization": "none",
122
+ "container": "TGI 2.2.0",
123
+ "status": "waiting for quota",
124
+ },
125
+ {
126
+ "instanceType": "p5.48xlarge",
127
+ "quantization": "none",
128
+ "container": "TGI 2.2.0",
129
+ "status": "OK",
130
+ "tokensPerSecond": "58",
131
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
132
+ },
133
+ {
134
+ "instanceType": "inf2.*",
135
+ "container": "TGI 2.2.0",
136
+ "status": "not supported",
137
+ "tokensPerSecond": "-",
138
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
139
+ },
140
+ ],
141
+ }
results_arcee_scribe.py ADDED
@@ -0,0 +1,71 @@
1
+ """Module containing performance results for the Arcee-Scribe model."""
2
+
3
+ results_arcee_scribe = {
4
+ "name": "Arcee-Scribe",
5
+ "modelType": "InternLM2.5 8B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.2xlarge",
9
+ "quantization": "none",
10
+ "container": "DJL 0.28 vLLM",
11
+ "status": "OK",
12
+ "tokensPerSecond": 29,
13
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
14
+ },
15
+ {
16
+ "instanceType": "g5.12xlarge",
17
+ "quantization": "none",
18
+ "container": "DJL 0.28 vLLM",
19
+ "status": "OK",
20
+ "tokensPerSecond": 65,
21
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
22
+ },
23
+ {
24
+ "instanceType": "g5.48xlarge",
25
+ "quantization": "none",
26
+ "container": "DJL 0.28 vLLM",
27
+ "status": "OK",
28
+ "tokensPerSecond": 80,
29
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
30
+ },
31
+ {
32
+ "instanceType": "g6.2xlarge",
33
+ "quantization": "none",
34
+ "container": "DJL 0.28 vLLM",
35
+ "status": "OK",
36
+ "tokensPerSecond": 16,
37
+ "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
38
+ },
39
+ {
40
+ "instanceType": "g6.12xlarge",
41
+ "quantization": "none",
42
+ "container": "DJL 0.28 vLLM",
43
+ "status": "OK",
44
+ "tokensPerSecond": 50,
45
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
46
+ },
47
+ {
48
+ "instanceType": "g6.48xlarge",
49
+ "quantization": "none",
50
+ "container": "DJL 0.28 vLLM",
51
+ "status": "OK",
52
+ "tokensPerSecond": 69,
53
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
54
+ },
55
+ {
56
+ "instanceType": "g6e.2xlarge",
57
+ "quantization": "none",
58
+ "container": "SGLang 0.2.13",
59
+ "status": "OK",
60
+ "tokensPerSecond": 46,
61
+ },
62
+ {
63
+ "instanceType": "p4d.24xlarge",
64
+ "quantization": "none",
65
+ "container": "DJL 0.28 vLLM",
66
+ "status": "OK",
67
+ "tokensPerSecond": 82,
68
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
69
+ },
70
+ ],
71
+ }
results_arcee_spark.py ADDED
@@ -0,0 +1,3 @@
1
+ """Module containing performance results for the Arcee-Spark model."""
2
+
3
+ results_arcee_spark = {"name": "Arcee-Spark", "modelType": "Qwen2 7B"}
results_arcee_supernova.py ADDED
@@ -0,0 +1,24 @@
1
+ """Module containing performance results for the Arcee-SuperNova model."""
2
+
3
+ results_arcee_supernova = {
4
+ "name": "Arcee-SuperNova",
5
+ "modelType": "Llama 3.1 70B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.12xlarge",
9
+ "quantization": "awq",
10
+ "container": "TGI 2.2.0",
11
+ "status": "OK",
12
+ "tokensPerSecond": "33",
13
+ "notes": "",
14
+ },
15
+ {
16
+ "instanceType": "p4d.24xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "38",
21
+ "notes": "",
22
+ },
23
+ ],
24
+ }
results_llama_spark.py ADDED
@@ -0,0 +1,73 @@
1
+ """Module containing performance results for the Llama-Spark model."""
2
+
3
+ results_llama_spark = {
4
+ "name": "Llama-Spark",
5
+ "modelType": "Llama 3.1 8B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.2xlarge",
9
+ "quantization": "none",
10
+ "container": "TGI 2.2.0",
11
+ "status": "OK",
12
+ "tokensPerSecond": "29",
13
+ "notes": "4K/8K fails",
14
+ },
15
+ {
16
+ "instanceType": "g5.12xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "85",
21
+ "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
22
+ },
23
+ {
24
+ "instanceType": "g5.48xlarge",
25
+ "quantization": "none",
26
+ "container": "TGI 2.2.0",
27
+ "status": "OK",
28
+ "tokensPerSecond": "105",
29
+ "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
30
+ },
31
+ {
32
+ "instanceType": "g6.12xlarge",
33
+ "quantization": "none",
34
+ "container": "TGI 2.2.0",
35
+ "status": "OK",
36
+ "tokensPerSecond": "51",
37
+ "notes": "same as g5?",
38
+ },
39
+ {
40
+ "instanceType": "g6.48xlarge",
41
+ "quantization": "none",
42
+ "container": "TGI 2.2.0",
43
+ "status": "OK",
44
+ "tokensPerSecond": "81",
45
+ "notes": "same as g5?",
46
+ },
47
+ {
48
+ "instanceType": "g6e.2xlarge",
49
+ "quantization": "none",
50
+ "status": "OK",
51
+ "configurations": [
52
+ {"container": "TGI 2.2.0", "tokensPerSecond": "42.1"},
53
+ {"container": "SGLang 0.2.13", "tokensPerSecond": "45"},
54
+ {"container": "vLLM 0.5.5", "tokensPerSecond": "43.4"},
55
+ ],
56
+ },
57
+ {
58
+ "instanceType": "p4d.24xlarge",
59
+ "quantization": "none",
60
+ "container": "TGI 2.2.0",
61
+ "status": "OK",
62
+ "tokensPerSecond": "145",
63
+ "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
64
+ },
65
+ {
66
+ "instanceType": "inf2.*",
67
+ "container": "TGI 2.2.0",
68
+ "status": "not supported",
69
+ "tokensPerSecond": "-",
70
+ "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
71
+ },
72
+ ],
73
+ }