Julien Simon committed on
Commit
0c0f086
1 Parent(s): 3fdf87c

Break results into one file per model

Browse files
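After this change, each model's benchmark data lives in its own results_<model>.py module, and results.py merely aggregates the per-model dicts into the same results structure the app already consumes. A minimal sketch of reading that aggregate (it mirrors get_model_names in app.py below; the print call is illustrative only):

# Sketch: consuming the aggregated results dict (structure as defined in results.py below)
from results import results

model_names = sorted(model["name"] for model in results["models"])
print(model_names)  # e.g. ['Arcee-Agent', 'Arcee-Lite', 'Arcee-Meraj', ...]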
.pre-commit-config.yaml CHANGED
@@ -57,13 +57,6 @@ repos:
57
  - id: cfn-lint
58
  files: cloudformation/.*\.(json|yml|yaml)$
59
 
60
- - repo: https://github.com/asottile/pyupgrade
61
- rev: v3.17.0
62
- hooks:
63
- - id: pyupgrade
64
- args: [--py310-plus]
65
- entry: bash -c 'pyupgrade "$@"; git add -u' --
66
-
67
  - repo: https://github.com/pre-commit/mirrors-mypy
68
  rev: v1.11.2
69
  hooks:
 
57
  - id: cfn-lint
58
  files: cloudformation/.*\.(json|yml|yaml)$
59
 
60
  - repo: https://github.com/pre-commit/mirrors-mypy
61
  rev: v1.11.2
62
  hooks:
.pylintrc CHANGED
@@ -1,2 +1,2 @@
1
  [MESSAGES CONTROL]
2
- disable=C0301,E0401,R0914
 
1
  [MESSAGES CONTROL]
2
+ disable=R0801,C0301,E0401,R0914,R1702
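For reference, the newly disabled pylint checks are R0801 (duplicate-code) and R1702 (too-many-nested-blocks), alongside the previously disabled C0301 (line-too-long), E0401 (import-error) and R0914 (too-many-locals); presumably the per-model result files, which share a near-identical dict layout, would otherwise trip duplicate-code, and the nested configuration loop in display_results would trip too-many-nested-blocks.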
app.py CHANGED
@@ -1,3 +1,8 @@
1
  import logging
2
  import re
3
 
@@ -16,7 +21,7 @@ def get_model_names():
16
  Returns:
17
  list: Sorted list of model names.
18
  """
19
- return sorted([model['name'] for model in results['models']])
20
 
21
 
22
  def get_models_by_architecture(model_name):
@@ -29,12 +34,14 @@ def get_models_by_architecture(model_name):
29
  Returns:
30
  list: List of models with the same architecture.
31
  """
32
- selected_model = next((m for m in results['models'] if m['name'] == model_name), None)
 
 
33
  if not selected_model:
34
  return []
35
-
36
- model_type = selected_model.get('modelType', '')
37
- return [m for m in results['models'] if m.get('modelType', '') == model_type]
38
 
39
 
40
  def custom_sort_key(instance_type):
@@ -47,12 +54,24 @@ def custom_sort_key(instance_type):
47
  Returns:
48
  tuple: A tuple used for sorting, containing (family, size_index).
49
  """
50
- size_order = ['xlarge', '2xlarge', '4xlarge', '8xlarge', '12xlarge', '16xlarge', '24xlarge', '48xlarge']
51
-
52
- match = re.match(r'([a-z]+\d+)\.(\w+)', instance_type)
53
  if match:
54
  family, size = match.groups()
55
- return (family, size_order.index(size) if size in size_order else len(size_order))
56
  return (instance_type, 0) # Fallback for non-standard instance types
57
 
58
 
@@ -71,109 +90,122 @@ def display_results(model_name):
71
  try:
72
  models = get_models_by_architecture(model_name)
73
  if not models:
74
- logging.warning(f"No models found for {model_name}")
75
- return f"No results found for the selected model: {model_name}", pd.DataFrame()
76
-
77
- model_type = models[0].get('modelType', 'N/A')
78
- data = {}
79
  merged_models = set()
80
 
81
  for model in models:
82
- merged_models.add(model.get('name', 'Unknown'))
83
- for config in model.get('configurations', []):
84
  try:
85
- cloud = config.get('cloud', 'N/A')
86
- instance_type = config.get('instanceType', 'N/A')
87
- key = (cloud, instance_type)
88
-
89
- if 'configurations' in config:
90
- for nested_config in config['configurations']:
91
- nested_key = key + (nested_config.get('quantization', 'N/A'),)
92
- data[nested_key] = {
93
  "Cloud": cloud,
94
  "Instance Type": instance_type,
95
- "GPU": config.get('gpu', 'N/A'),
96
- "GPU RAM": config.get('gpuRAM', 'N/A'),
97
- "Status": nested_config.get('status', 'N/A'),
98
- "Quantization": nested_config.get('quantization', 'N/A'),
99
- "Container": nested_config.get('container', nested_config.get('tgi', 'N/A')),
100
- "Tokens per Second": nested_config.get('tokensPerSecond', 'N/A'),
101
- "Notes": nested_config.get('notes', ''),
102
  }
103
- else:
104
- # Generate a unique key for each configuration
105
- unique_key = key + (config.get('quantization', 'N/A'), len(data))
106
- data[unique_key] = {
107
- "Cloud": cloud,
108
- "Instance Type": instance_type,
109
- "GPU": config.get('gpu', 'N/A'),
110
- "GPU RAM": config.get('gpuRAM', 'N/A'),
111
- "Status": config.get('status', 'N/A'),
112
- "Quantization": config.get('quantization', 'N/A'),
113
- "Container": config.get('container', config.get('tgi', 'N/A')),
114
- "Tokens per Second": config.get('tokensPerSecond', 'N/A'),
115
- "Notes": config.get('notes', ''),
116
- }
117
- except Exception as e:
118
- print(f"Error processing configuration: {e}")
119
  continue
120
 
121
  if not data:
122
- logging.warning(f"No data extracted for {model_name}")
123
- return f"No data could be extracted for the selected model: {model_name}", pd.DataFrame()
124
-
125
- # Merge data if there are conflicts
126
- for key, value in data.items():
127
- for field in value:
128
- if value[field] == 'N/A':
129
- for other_key, other_value in data.items():
130
- if other_key[0] == key[0] and other_value[field] != 'N/A':
131
- value[field] = other_value[field]
132
- break
133
-
134
- # Filter out rows where Status is 'N/A'
135
- data = {k: v for k, v in data.items() if v['Status'] != 'N/A'}
136
 
137
- merged_models_message = f"Note: Results merged from models: {', '.join(merged_models)}" if len(merged_models) > 1 else None
138
 
139
- # Sort the data by instance type
140
- sorted_data = sorted(data.values(), key=lambda x: custom_sort_key(x['Instance Type']))
141
 
142
- results = f"## Results for {model_name}\n\nModel Type: {model_type}"
143
  if merged_models_message:
144
- results += f"\n\n{merged_models_message}"
145
-
146
  df = pd.DataFrame(sorted_data)
147
-
148
  def color_status(val):
149
- if val == 'OK':
150
- return 'background-color: green; color: white'
151
- elif val == 'KO':
152
- return 'background-color: red; color: white'
153
- else:
154
- return ''
155
 
156
- styled_df = df.style.applymap(color_status, subset=['Status'])
157
-
158
- return results, styled_df
159
 
160
- except Exception as e:
161
- logging.exception(f"Error in display_results: {e}")
162
- return f"An error occurred while processing results for {model_name}: {str(e)}", pd.DataFrame()
163
 
164
  with gr.Blocks() as demo:
165
  gr.Markdown("# Model Benchmark Results")
166
- gr.Markdown("This table shows the benchmark results for each model. Container settings ([TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher), [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.) are default unless noted.")
167
  model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")
168
-
169
  results_text = gr.Markdown()
170
  results_output = gr.DataFrame(label="Results")
171
-
172
  model_dropdown.change(
173
- display_results,
174
- inputs=[model_dropdown],
175
- outputs=[results_text, results_output]
176
  )
177
 
178
- if __name__ == "__main__":
179
- demo.launch()
 
1
+ """
2
+ This module provides functionality for displaying and analyzing model benchmark results.
3
+ It includes functions for data processing, sorting, and a Gradio interface for user interaction.
4
+ """
5
+
6
  import logging
7
  import re
8
 
 
21
  Returns:
22
  list: Sorted list of model names.
23
  """
24
+ return sorted([model["name"] for model in results["models"]])
25
 
26
 
27
  def get_models_by_architecture(model_name):
 
34
  Returns:
35
  list: List of models with the same architecture.
36
  """
37
+ selected_model = next(
38
+ (m for m in results["models"] if m["name"] == model_name), None
39
+ )
40
  if not selected_model:
41
  return []
42
+
43
+ model_type = selected_model.get("modelType", "")
44
+ return [m for m in results["models"] if m.get("modelType", "") == model_type]
45
 
46
 
47
  def custom_sort_key(instance_type):
 
54
  Returns:
55
  tuple: A tuple used for sorting, containing (family, size_index).
56
  """
57
+ size_order = [
58
+ "xlarge",
59
+ "2xlarge",
60
+ "4xlarge",
61
+ "8xlarge",
62
+ "12xlarge",
63
+ "16xlarge",
64
+ "24xlarge",
65
+ "48xlarge",
66
+ ]
67
+
68
+ match = re.match(r"([a-z]+\d+)\.(\w+)", instance_type)
69
  if match:
70
  family, size = match.groups()
71
+ return (
72
+ family,
73
+ size_order.index(size) if size in size_order else len(size_order),
74
+ )
75
  return (instance_type, 0) # Fallback for non-standard instance types
76
 
77
 
 
90
  try:
91
  models = get_models_by_architecture(model_name)
92
  if not models:
93
+ logging.warning("No models found for %s", model_name)
94
+ return (
95
+ f"No results found for the selected model: {model_name}",
96
+ pd.DataFrame(),
97
+ )
98
+
99
+ model_type = models[0].get("modelType", "N/A")
100
+ data = []
101
  merged_models = set()
102
 
103
  for model in models:
104
+ merged_models.add(model.get("name", "Unknown"))
105
+ for config in model.get("configurations", []):
106
  try:
107
+ cloud = config.get("cloud", "N/A")
108
+ instance_type = config.get("instanceType", "N/A")
109
+
110
+ if "configurations" in config:
111
+ for nested_config in config["configurations"]:
112
+ data.append(
113
+ {
114
+ "Cloud": cloud,
115
+ "Instance Type": instance_type,
116
+ "GPU": config.get("gpu", "N/A"),
117
+ "GPU RAM": config.get("gpuRAM", "N/A"),
118
+ "Status": nested_config.get("status", "N/A"),
119
+ "Quantization": nested_config.get(
120
+ "quantization", "N/A"
121
+ ),
122
+ "Container": nested_config.get(
123
+ "container",
124
+ nested_config.get("tgi", "N/A"),
125
+ ),
126
+ "Tokens per Second": nested_config.get(
127
+ "tokensPerSecond", "N/A"
128
+ ),
129
+ "Notes": nested_config.get("notes", ""),
130
+ }
131
+ )
132
+ else:
133
+ data.append(
134
+ {
135
  "Cloud": cloud,
136
  "Instance Type": instance_type,
137
+ "GPU": config.get("gpu", "N/A"),
138
+ "GPU RAM": config.get("gpuRAM", "N/A"),
139
+ "Status": config.get("status", "N/A"),
140
+ "Quantization": config.get("quantization", "N/A"),
141
+ "Container": config.get(
142
+ "container", config.get("tgi", "N/A")
143
+ ),
144
+ "Tokens per Second": config.get(
145
+ "tokensPerSecond", "N/A"
146
+ ),
147
+ "Notes": config.get("notes", ""),
148
  }
149
+ )
150
+ except (KeyError, ValueError, TypeError) as e:
151
+ logging.error("Error processing configuration: %s", e)
152
  continue
153
 
154
  if not data:
155
+ logging.warning("No data extracted for %s", model_name)
156
+ return (
157
+ f"No data for the selected model: {model_name}",
158
+ pd.DataFrame(),
159
+ )
160
 
161
+ merged_models_message = (
162
+ f"Note: Results merged from models: {', '.join(merged_models)}"
163
+ if len(merged_models) > 1
164
+ else None
165
+ )
166
 
167
+ sorted_data = sorted(data, key=lambda x: custom_sort_key(x["Instance Type"]))
 
168
 
169
+ result_text = f"## Results for {model_name}\n\nModel Type: {model_type}"
170
  if merged_models_message:
171
+ result_text += f"\n\n{merged_models_message}"
172
+
173
  df = pd.DataFrame(sorted_data)
174
+
175
  def color_status(val):
176
+ if val == "OK":
177
+ return "background-color: green; color: white"
178
+ if val == "KO":
179
+ return "background-color: red; color: white"
180
+ return ""
181
+
182
+ styled_df = df.style.applymap(color_status, subset=["Status"])
183
+
184
+ return result_text, styled_df
185
 
186
+ except (KeyError, ValueError, TypeError) as e:
187
+ logging.exception("Error in display_results: %s", e)
188
+ return (
189
+ f"An error for {model_name}: {str(e)}",
190
+ pd.DataFrame(),
191
+ )
192
 
193
 
194
  with gr.Blocks() as demo:
195
  gr.Markdown("# Model Benchmark Results")
196
+ gr.Markdown(
197
+ """This table shows the benchmark results for each model. \n
198
+ Container settings ([TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher),
199
+ [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.) are default unless noted."""
200
+ )
201
  model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")
202
+
203
  results_text = gr.Markdown()
204
  results_output = gr.DataFrame(label="Results")
205
+
206
  model_dropdown.change(
207
+ display_results, inputs=[model_dropdown], outputs=[results_text, results_output]
 
 
208
  )
209
 
210
+ if __name__ == "__main__":
211
+ demo.launch()
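The one subtle piece of the new app.py is custom_sort_key, which orders table rows by instance family and then by size. A quick sketch of the ordering it produces (assuming app.py and its dependencies are importable; the instance types come from the result files below):

# Sketch: how custom_sort_key orders instance types (family first, then size)
from app import custom_sort_key

instance_types = ["g5.48xlarge", "g5.2xlarge", "g6.12xlarge", "inf2.*", "g5.xlarge"]
print(sorted(instance_types, key=custom_sort_key))
# ['g5.xlarge', 'g5.2xlarge', 'g5.48xlarge', 'g6.12xlarge', 'inf2.*']
# "inf2.*" does not match the family/size regex, so it falls back to (instance_type, 0)

Note that pandas 2.1 renamed Styler.applymap to Styler.map, so the df.style.applymap call above may emit a deprecation warning on newer pandas versions.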
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  gradio
 
 
1
  gradio
2
+ pandas
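pandas is added as an explicit dependency because app.py builds the results table as a pd.DataFrame and styles it with pandas' Styler before handing it to the Gradio DataFrame component; previously it was presumably only available as a transitive dependency.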
results.py CHANGED
@@ -1,699 +1,43 @@
1
  """Module containing model configuration results for various AI models and hardware setups."""
2
 
3
  results = {
4
  "models": [
5
- {
6
- "name": "Arcee-Meraj",
7
- "modelType": "Qwen2 72B",
8
- "configurations": [
9
- {
10
- "region": "us-west-2",
11
- "instanceType": "g5.12xlarge",
12
- "cloud": "AWS",
13
- "gpu": "4xNVIDIA A10G",
14
- "gpuRAM": "96 GB",
15
- "quantization": "awq",
16
- "container": "TGI 2.2.0",
17
- "status": "OK",
18
- "tokensPerSecond": "33",
19
- "notes": "",
20
- },
21
- {
22
- "region": "us-west-2",
23
- "instanceType": "p4d.24xlarge",
24
- "cloud": "AWS",
25
- "gpu": "4xNVIDIA A100",
26
- "gpuRAM": "320 GB",
27
- "quantization": "none",
28
- "container": "TGI 2.2.0",
29
- "status": "OK",
30
- "tokensPerSecond": "38",
31
- "notes": "",
32
- },
33
- ],
34
- },
35
- {
36
- "name": "Arcee-SuperNova",
37
- "modelType": "Llama 3.1 70B",
38
- "configurations": [
39
- {
40
- "region": "us-west-2",
41
- "instanceType": "g5.12xlarge",
42
- "cloud": "AWS",
43
- "gpu": "4xNVIDIA A10G",
44
- "gpuRAM": "96 GB",
45
- "quantization": "awq",
46
- "container": "TGI 2.2.0",
47
- "status": "OK",
48
- "tokensPerSecond": "33",
49
- "notes": "",
50
- },
51
- {
52
- "region": "us-west-2",
53
- "instanceType": "p4d.24xlarge",
54
- "cloud": "AWS",
55
- "gpu": "4xNVIDIA A100",
56
- "gpuRAM": "320 GB",
57
- "quantization": "none",
58
- "container": "TGI 2.2.0",
59
- "status": "OK",
60
- "tokensPerSecond": "38",
61
- "notes": "",
62
- },
63
- ],
64
- },
65
- {
66
- "name": "Arcee-Nova",
67
- "modelType": "Qwen2 72B",
68
- "notes": "",
69
- "configurations": [
70
- {
71
- "region": "us-west-2",
72
- "instanceType": "g4dn.12xlarge",
73
- "cloud": "AWS",
74
- "gpu": "4xNVIDIA T4",
75
- "gpuRAM": "64 GB",
76
- "quantization": "bitsandbytes-nf4",
77
- "container": "TGI 2.2.0",
78
- "status": "KO",
79
- "tokensPerSecond": "-",
80
- "notes": "Flash Attention requires Ampere GPUs or newer",
81
- },
82
- {
83
- "region": "us-west-2",
84
- "instanceType": "g5.12xlarge",
85
- "cloud": "AWS",
86
- "gpu": "4xNVIDIA A10G",
87
- "gpuRAM": "96 GB",
88
- "configurations": [
89
- {
90
- "quantization": "bitsandbytes-nf4",
91
- "container": "TGI 2.2.0",
92
- "status": "OK",
93
- "tokensPerSecond": "12",
94
- },
95
- {
96
- "quantization": "bitsandbytes-fp4",
97
- "container": "TGI 2.2.0",
98
- "status": "OK",
99
- "tokensPerSecond": "12",
100
- },
101
- {
102
- "quantization": "bitsandbytes (int8)",
103
- "container": "TGI 2.2.0",
104
- "status": "KO",
105
- "tokensPerSecond": "-",
106
- "notes": "CUDA OOM",
107
- },
108
- {
109
- "quantization": "eetq (int8)",
110
- "container": "TGI 2.2.0",
111
- "status": "KO",
112
- "tokensPerSecond": "-",
113
- "notes": "[FT Error] Heurisitc failed to find a valid config.",
114
- },
115
- ],
116
- },
117
- {
118
- "region": "us-west-2",
119
- "instanceType": "g5.48xlarge",
120
- "cloud": "AWS",
121
- "gpu": "8xNVIDIA A10G",
122
- "gpuRAM": "192 GB",
123
- "configurations": [
124
- {
125
- "quantization": "none",
126
- "container": "TGI 2.2.0",
127
- "status": "KO",
128
- "tokensPerSecond": "-",
129
- "notes": "CUDA OOM (but g6.48xlarge works!)",
130
- },
131
- {
132
- "quantization": "bitsandbytes-nf4",
133
- "container": "TGI 2.2.0",
134
- "status": "OK",
135
- "tokensPerSecond": "12.3",
136
- },
137
- {
138
- "quantization": "bitsandbytes-fp4",
139
- "container": "TGI 2.2.0",
140
- "status": "OK",
141
- "tokensPerSecond": "12.5",
142
- },
143
- {
144
- "quantization": "bitsandbytes (int8)",
145
- "container": "TGI 2.2.0",
146
- "status": "KO",
147
- "tokensPerSecond": "-",
148
- "notes": "The model deploys, but inference times out.",
149
- },
150
- ],
151
- },
152
- {
153
- "region": "us-west-2",
154
- "instanceType": "g6.12xlarge",
155
- "cloud": "AWS",
156
- "gpu": "4xNVIDIA L4",
157
- "gpuRAM": "96 GB",
158
- "configurations": [
159
- {
160
- "quantization": "bitsandbytes-nf4",
161
- "container": "TGI 2.2.0",
162
- "status": "OK",
163
- "tokensPerSecond": "1.5-2",
164
- "notes": "Too slow, timeouts are likely",
165
- },
166
- {
167
- "quantization": "bitsandbytes-fp4",
168
- "container": "TGI 2.2.0",
169
- "status": "OK",
170
- "tokensPerSecond": "2",
171
- "notes": "Too slow, timeouts are likely",
172
- },
173
- {
174
- "quantization": "bitsandbytes (int8)",
175
- "container": "TGI 2.2.0",
176
- "status": "KO",
177
- "tokensPerSecond": "-",
178
- "notes": "CUDA OOM",
179
- },
180
- ],
181
- },
182
- {
183
- "region": "us-west-2",
184
- "instanceType": "g6.48xlarge",
185
- "cloud": "AWS",
186
- "gpu": "8xNVIDIA L4",
187
- "gpuRAM": "192 GB",
188
- "quantization": "none",
189
- "container": "TGI 2.2.0",
190
- "status": "OK",
191
- "tokensPerSecond": "12",
192
- },
193
- {
194
- "region": "us-west-2",
195
- "instanceType": "p4d.24xlarge",
196
- "cloud": "AWS",
197
- "gpu": "8xNVIDIA A100",
198
- "gpuRAM": "320 GB",
199
- "quantization": "none",
200
- "container": "TGI 2.2.0",
201
- "status": "OK",
202
- "tokensPerSecond": "40",
203
- "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
204
- },
205
- {
206
- "region": "us-west-2",
207
- "instanceType": "p4de.24xlarge",
208
- "cloud": "AWS",
209
- "gpu": "8xNVIDIA A100",
210
- "gpuRAM": "320 GB",
211
- "quantization": "none",
212
- "container": "TGI 2.2.0",
213
- "status": "waiting for quota",
214
- },
215
- {
216
- "region": "us-west-2",
217
- "instanceType": "p5.48xlarge",
218
- "cloud": "AWS",
219
- "gpu": "8xNVIDIA H100",
220
- "gpuRAM": "640GB",
221
- "quantization": "none",
222
- "container": "TGI 2.2.0",
223
- "status": "OK",
224
- "tokensPerSecond": "58",
225
- "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
226
- },
227
- {
228
- "region": "us-west-2",
229
- "instanceType": "inf2.*",
230
- "cloud": "AWS",
231
- "gpu": "-",
232
- "container": "TGI 2.2.0",
233
- "status": "not supported",
234
- "tokensPerSecond": "-",
235
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
236
- },
237
- ],
238
- },
239
- {
240
- "name": "Llama-Spark",
241
- "modelType": "Llama 3.1 8B",
242
- "configurations": [
243
- {
244
- "region": "AWS",
245
- "instanceType": "g5.2xlarge",
246
- "cloud": "AWS",
247
- "gpu": "1xNVIDIA A10G",
248
- "gpuRAM": "24 GB",
249
- "quantization": "none",
250
- "container": "TGI 2.2.0",
251
- "status": "OK",
252
- "tokensPerSecond": "29",
253
- "notes": "4K/8K fails",
254
- },
255
- {
256
- "region": "AWS",
257
- "instanceType": "g5.12xlarge",
258
- "cloud": "AWS",
259
- "gpu": "4xNVIDIA A10G",
260
- "gpuRAM": "96 GB",
261
- "quantization": "none",
262
- "container": "TGI 2.2.0",
263
- "status": "OK",
264
- "tokensPerSecond": "85",
265
- "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
266
- },
267
- {
268
- "region": "AWS",
269
- "instanceType": "g5.48xlarge",
270
- "cloud": "AWS",
271
- "gpu": "8xNVIDIA A10G",
272
- "gpuRAM": "192 GB",
273
- "quantization": "none",
274
- "container": "TGI 2.2.0",
275
- "status": "OK",
276
- "tokensPerSecond": "105",
277
- "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
278
- },
279
- {
280
- "region": "AWS",
281
- "instanceType": "g6.2xlarge",
282
- "cloud": "AWS",
283
- "gpu": "1xNVIDIA L4",
284
- "gpuRAM": "24 GB",
285
- "configurations": [
286
- {
287
- "quantization": "none",
288
- "container": "TGI 2.2.0",
289
- "status": "OK",
290
- "tokensPerSecond": "15",
291
- },
292
- {"quantization": "fp8", "container": "TGI 2.2.0"},
293
- ],
294
- },
295
- {
296
- "region": "AWS",
297
- "instanceType": "g6.12xlarge",
298
- "cloud": "AWS",
299
- "gpu": "4xNVIDIA L4",
300
- "gpuRAM": "96 GB",
301
- "quantization": "none",
302
- "container": "TGI 2.2.0",
303
- "status": "OK",
304
- "tokensPerSecond": "51",
305
- "notes": "same as g5?",
306
- },
307
- {
308
- "region": "AWS",
309
- "instanceType": "g6.48xlarge",
310
- "cloud": "AWS",
311
- "gpu": "8xNVIDIA L4",
312
- "gpuRAM": "192 GB",
313
- "quantization": "none",
314
- "container": "TGI 2.2.0",
315
- "status": "OK",
316
- "tokensPerSecond": "81",
317
- "notes": "same as g5?",
318
- },
319
- {
320
- "region": "AWS",
321
- "instanceType": "g6e.2xlarge",
322
- "cloud": "AWS",
323
- "gpu": "1xNVIDIA L40S",
324
- "gpuRAM": "48 GB",
325
- "quantization": "none",
326
- "container": "TGI 2.2.0",
327
- "status": "OK",
328
- "tokensPerSecond": "42.1",
329
- },
330
- {
331
- "region": "AWS",
332
- "instanceType": "g6e.2xlarge",
333
- "cloud": "AWS",
334
- "gpu": "1xNVIDIA L40S",
335
- "gpuRAM": "48 GB",
336
- "quantization": "none",
337
- "container": "SGLang 0.2.13",
338
- "status": "OK",
339
- "tokensPerSecond": "45",
340
- },
341
- {
342
- "region": "AWS",
343
- "instanceType": "g6e.2xlarge",
344
- "cloud": "AWS",
345
- "gpu": "1xNVIDIA L40S",
346
- "gpuRAM": "48 GB",
347
- "quantization": "none",
348
- "container": "vLLM 0.5.5",
349
- "status": "OK",
350
- "tokensPerSecond": "43.4",
351
- },
352
- {
353
- "region": "AWS",
354
- "instanceType": "p4d.24xlarge",
355
- "cloud": "AWS",
356
- "gpu": "4xNVIDIA A100",
357
- "gpuRAM": "320 GB",
358
- "quantization": "none",
359
- "container": "TGI 2.2.0",
360
- "status": "OK",
361
- "tokensPerSecond": "145",
362
- "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
363
- },
364
- {
365
- "region": "AWS",
366
- "instanceType": "inf2.*",
367
- "cloud": "AWS",
368
- "gpu": "-",
369
- "status": "not supported",
370
- "tokensPerSecond": "-",
371
- "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
372
- },
373
- ],
374
- },
375
- {
376
- "name": "Arcee-Agent",
377
- "modelType": "Qwen2 7B",
378
- "notes": "",
379
- "configurations": [
380
- {
381
- "region": "us-west-2",
382
- "instanceType": "g5.2xlarge",
383
- "cloud": "AWS",
384
- "gpu": "1xNVIDIA A10G",
385
- "gpuRAM": "24 GB",
386
- "quantization": "none",
387
- "container": "TGI 2.2.0",
388
- "status": "OK",
389
- "tokensPerSecond": "30",
390
- },
391
- {
392
- "region": "us-west-2",
393
- "instanceType": "g5.12xlarge",
394
- "cloud": "AWS",
395
- "gpu": "4xNVIDIA A10G",
396
- "gpuRAM": "96 GB",
397
- "quantization": "none",
398
- "container": "TGI 2.2.0",
399
- "status": "OK",
400
- "tokensPerSecond": "83",
401
- },
402
- {
403
- "region": "us-west-2",
404
- "instanceType": "g5.48xlarge",
405
- "cloud": "AWS",
406
- "gpu": "8xNVIDIA A10G",
407
- "gpuRAM": "192 GB",
408
- "quantization": "none",
409
- "container": "TGI 2.2.0",
410
- "status": "KO",
411
- "tokensPerSecond": "-",
412
- "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
413
- },
414
- {
415
- "region": "us-west-2",
416
- "instanceType": "g6.2xlarge",
417
- "cloud": "AWS",
418
- "gpu": "1xNVIDIA L4",
419
- "gpuRAM": "24 GB",
420
- "quantization": "none",
421
- "container": "TGI 2.2.0",
422
- "status": "OK",
423
- "tokensPerSecond": "16.3",
424
- },
425
- {
426
- "region": "us-west-2",
427
- "instanceType": "g6.12xlarge",
428
- "cloud": "AWS",
429
- "gpu": "4xNVIDIA L4",
430
- "gpuRAM": "96 GB",
431
- "quantization": "none",
432
- "container": "TGI 2.2.0",
433
- "status": "OK",
434
- "tokensPerSecond": "54.2",
435
- },
436
- {
437
- "region": "us-west-2",
438
- "instanceType": "inf2.*",
439
- "cloud": "AWS",
440
- "gpu": "-",
441
- "container": "TGI 2.2.0",
442
- "status": "not supported",
443
- "tokensPerSecond": "-",
444
- "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
445
- },
446
- {
447
- "region": "us-west-2",
448
- "instanceType": "g6e.2xlarge",
449
- "cloud": "AWS",
450
- "gpu": "1xNVIDIA L40S",
451
- "gpuRAM": "48 GB",
452
- "quantization": "none",
453
- "container": "TGI 2.2.0",
454
- "status": "OK",
455
- "tokensPerSecond": "45",
456
- },
457
- {
458
- "region": "us-west-2",
459
- "instanceType": "g6e.2xlarge",
460
- "cloud": "AWS",
461
- "gpu": "1xNVIDIA L40S",
462
- "gpuRAM": "48 GB",
463
- "quantization": "none",
464
- "container": "SGLang 0.2.13",
465
- "status": "OK",
466
- "tokensPerSecond": "48",
467
- },
468
- {
469
- "region": "us-west-2",
470
- "instanceType": "g6e.2xlarge",
471
- "cloud": "AWS",
472
- "gpu": "1xNVIDIA L40S",
473
- "gpuRAM": "48 GB",
474
- "quantization": "none",
475
- "container": "vLLM 0.5.5",
476
- "status": "OK",
477
- "tokensPerSecond": "45.7",
478
- },
479
- ],
480
- },
481
- {"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
482
- {
483
- "name": "Arcee-Lite",
484
- "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
485
- "configurations": [
486
- {
487
- "region": "us-west-2",
488
- "instanceType": "c6i.xlarge",
489
- "cloud": "AWS",
490
- "gpu": "-",
491
- "gpuRAM": "-",
492
- "quantization": "bitsandbytes-nf4",
493
- "container": "TGI 2.2.0",
494
- "status": "KO",
495
- "tokensPerSecond": "-",
496
- "notes": "OOM, might work with a prequantized model",
497
- },
498
- {
499
- "region": "us-west-2",
500
- "instanceType": "c6i.2xlarge",
501
- "cloud": "AWS",
502
- "gpu": "-",
503
- "gpuRAM": "-",
504
- "quantization": "bitsandbytes-nf4",
505
- "container": "TGI 2.2.0",
506
- "status": "KO",
507
- "tokensPerSecond": "-",
508
- "notes": "OOM, might work with a prequantized model",
509
- },
510
- {
511
- "region": "us-west-2",
512
- "instanceType": "c6i.4xlarge",
513
- "cloud": "AWS",
514
- "gpu": "-",
515
- "gpuRAM": "-",
516
- "configurations": [
517
- {
518
- "quantization": "none",
519
- "container": "TGI 2.2.0",
520
- "status": "OK",
521
- "tokensPerSecond": "10.7",
522
- },
523
- {
524
- "quantization": "bitsandbytes (int8)",
525
- "container": "TGI 2.2.0",
526
- "status": "OK",
527
- "tokensPerSecond": "10.5",
528
- },
529
- {
530
- "quantization": "bitsandbytes-nf4",
531
- "container": "TGI 2.2.0",
532
- "status": "OK",
533
- "tokensPerSecond": "10.6",
534
- },
535
- ],
536
- },
537
- {
538
- "region": "us-west-2",
539
- "instanceType": "c7i.4xlarge",
540
- "cloud": "AWS",
541
- "gpu": "-",
542
- "gpuRAM": "-",
543
- "quantization": "none",
544
- "container": "TGI 2.2.0",
545
- "status": "waiting for quota",
546
- "tokensPerSecond": "-",
547
- },
548
- {
549
- "region": "us-west-2",
550
- "instanceType": "g5.xlarge",
551
- "cloud": "AWS",
552
- "gpu": "1xNVIDIA A10G",
553
- "gpuRAM": "24 GB",
554
- "configurations": [
555
- {
556
- "quantization": "none",
557
- "container": "TGI 2.2.0",
558
- "status": "OK",
559
- "tokensPerSecond": "110",
560
- },
561
- {
562
- "quantization": "none",
563
- "container": "DJL 0.28 vLLM",
564
- "status": "OK",
565
- "tokensPerSecond": "105",
566
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
567
- },
568
- ],
569
- },
570
- {
571
- "region": "us-west-2",
572
- "instanceType": "g6e.2xlarge",
573
- "cloud": "AWS",
574
- "gpu": "1xNVIDIA L40S",
575
- "gpuRAM": "48 GB",
576
- "quantization": "none",
577
- "container": "TGI 2.2.0",
578
- "status": "OK",
579
- "tokensPerSecond": "160",
580
- },
581
- {
582
- "region": "us-west-2",
583
- "instanceType": "g6e.2xlarge",
584
- "cloud": "AWS",
585
- "gpu": "1xNVIDIA L40S",
586
- "gpuRAM": "48 GB",
587
- "quantization": "none",
588
- "container": "vLLM 0.5.5",
589
- "status": "OK",
590
- "tokensPerSecond": "146",
591
- },
592
- {
593
- "region": "us-west-2",
594
- "instanceType": "g6e.2xlarge",
595
- "cloud": "AWS",
596
- "gpu": "1xNVIDIA L40S",
597
- "gpuRAM": "48 GB",
598
- "quantization": "none",
599
- "container": "SGLang 0.2.13",
600
- "status": "OK",
601
- "tokensPerSecond": "167",
602
- },
603
- ],
604
- },
605
- {
606
- "name": "Arcee-Scribe",
607
- "modelType": "InternLM2.5 8B",
608
- "configurations": [
609
- {
610
- "cloud": "AWS",
611
- "instanceType": "g5.2xlarge",
612
- "gpu": "1xNVIDIA A10G",
613
- "gpuRAM": "24 GB",
614
- "quantization": "none",
615
- "container": "DJL 0.28 vLLM",
616
- "status": "OK",
617
- "tokensPerSecond": 29,
618
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
619
- },
620
- {
621
- "cloud": "AWS",
622
- "instanceType": "g5.12xlarge",
623
- "gpu": "4xNVIDIA A10G",
624
- "gpuRAM": "96 GB",
625
- "quantization": "none",
626
- "container": "DJL 0.28 vLLM",
627
- "status": "OK",
628
- "tokensPerSecond": 65,
629
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
630
- },
631
- {
632
- "cloud": "AWS",
633
- "instanceType": "g5.48xlarge",
634
- "gpu": "8xNVIDIA A10G",
635
- "gpuRAM": "192 GB",
636
- "quantization": "none",
637
- "container": "DJL 0.28 vLLM",
638
- "status": "OK",
639
- "tokensPerSecond": 80,
640
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
641
- },
642
- {
643
- "cloud": "AWS",
644
- "instanceType": "g6.2xlarge",
645
- "gpu": "1xNVIDIA L4",
646
- "gpuRAM": "24 GB",
647
- "quantization": "none",
648
- "container": "DJL 0.28 vLLM",
649
- "status": "OK",
650
- "tokensPerSecond": 16,
651
- "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
652
- },
653
- {
654
- "cloud": "AWS",
655
- "instanceType": "g6.12xlarge",
656
- "gpu": "4xNVIDIA L4",
657
- "gpuRAM": "96 GB",
658
- "quantization": "none",
659
- "container": "DJL 0.28 vLLM",
660
- "status": "OK",
661
- "tokensPerSecond": 50,
662
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
663
- },
664
- {
665
- "cloud": "AWS",
666
- "instanceType": "g6.48xlarge",
667
- "gpu": "8xNVIDIA L4",
668
- "gpuRAM": "192 GB",
669
- "quantization": "none",
670
- "container": "DJL 0.28 vLLM",
671
- "status": "OK",
672
- "tokensPerSecond": 69,
673
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
674
- },
675
- {
676
- "cloud": "AWS",
677
- "instanceType": "g6e.2xlarge",
678
- "gpu": "1xNVIDIA L40S",
679
- "gpuRAM": "48 GB",
680
- "quantization": "none",
681
- "container": "SGLang 0.2.13",
682
- "status": "OK",
683
- "tokensPerSecond": 46,
684
- },
685
- {
686
- "cloud": "AWS",
687
- "instanceType": "p4d.24xlarge",
688
- "gpu": "4xNVIDIA A100",
689
- "gpuRAM": "320 GB",
690
- "quantization": "none",
691
- "container": "DJL 0.28 vLLM",
692
- "status": "OK",
693
- "tokensPerSecond": 82,
694
- "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
695
- },
696
- ],
697
- },
698
  ]
699
  }
 
1
  """Module containing model configuration results for various AI models and hardware setups."""
2
 
3
+ from results_arcee_agent import results_arcee_agent
4
+ from results_arcee_lite import results_arcee_lite
5
+ from results_arcee_meraj import results_arcee_meraj
6
+ from results_arcee_nova import results_arcee_nova
7
+ from results_arcee_scribe import results_arcee_scribe
8
+ from results_arcee_spark import results_arcee_spark
9
+ from results_arcee_supernova import results_arcee_supernova
10
+ from results_llama_spark import results_llama_spark
11
+
12
+ instance_type_mappings = {
13
+ "g5.xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA A10G", "gpuRAM": "24 GB"},
14
+ "g5.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA A10G", "gpuRAM": "24 GB"},
15
+ "g5.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA A10G", "gpuRAM": "96 GB"},
16
+ "g5.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA A10G", "gpuRAM": "192 GB"},
17
+ "g6.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA L4", "gpuRAM": "24 GB"},
18
+ "g6.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA L4", "gpuRAM": "96 GB"},
19
+ "g6.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA L4", "gpuRAM": "192 GB"},
20
+ "g6e.2xlarge": {"cloud": "AWS", "gpu": "1xNVIDIA L40S", "gpuRAM": "48 GB"},
21
+ "g4dn.12xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA T4", "gpuRAM": "64 GB"},
22
+ "p4d.24xlarge": {"cloud": "AWS", "gpu": "4xNVIDIA A100", "gpuRAM": "320 GB"},
23
+ "p4de.24xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA A100", "gpuRAM": "320 GB"},
24
+ "p5.48xlarge": {"cloud": "AWS", "gpu": "8xNVIDIA H100", "gpuRAM": "640GB"},
25
+ "c6i.xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
26
+ "c6i.2xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
27
+ "c6i.4xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
28
+ "c7i.4xlarge": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
29
+ "inf2.*": {"cloud": "AWS", "gpu": "-", "gpuRAM": "-"},
30
+ }
31
+
32
  results = {
33
  "models": [
34
+ results_arcee_meraj,
35
+ results_arcee_supernova,
36
+ results_arcee_nova,
37
+ results_llama_spark,
38
+ results_arcee_agent,
39
+ results_arcee_spark,
40
+ results_arcee_lite,
41
+ results_arcee_scribe,
 
42
  ]
43
  }
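The per-model files below no longer repeat cloud, gpu and gpuRAM on every configuration; instance_type_mappings above presumably supplies those fields from the instance type when results are displayed. A minimal sketch of how such a lookup could be applied (the helper name and merge order are assumptions, not part of this commit):

# Hypothetical helper: fill in hardware details from instance_type_mappings
def enrich_configuration(config: dict) -> dict:
    mapping = instance_type_mappings.get(config.get("instanceType", ""), {})
    return {**mapping, **config}  # values set on the config itself take precedence

# enrich_configuration({"instanceType": "g5.12xlarge", "quantization": "awq"})
# -> {'cloud': 'AWS', 'gpu': '4xNVIDIA A10G', 'gpuRAM': '96 GB',
#     'instanceType': 'g5.12xlarge', 'quantization': 'awq'}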
results_arcee_agent.py ADDED
@@ -0,0 +1,75 @@
1
+ """Module containing performance results for the Arcee-Agent model."""
2
+
3
+ results_arcee_agent = {
4
+ "name": "Arcee-Agent",
5
+ "modelType": "Qwen2 7B",
6
+ "notes": "",
7
+ "configurations": [
8
+ {
9
+ "instanceType": "g5.2xlarge",
10
+ "quantization": "none",
11
+ "container": "TGI 2.2.0",
12
+ "status": "OK",
13
+ "tokensPerSecond": "30",
14
+ },
15
+ {
16
+ "instanceType": "g5.12xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "83",
21
+ },
22
+ {
23
+ "instanceType": "g5.48xlarge",
24
+ "quantization": "none",
25
+ "container": "TGI 2.2.0",
26
+ "status": "KO",
27
+ "tokensPerSecond": "-",
28
+ "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
29
+ },
30
+ {
31
+ "instanceType": "g6.2xlarge",
32
+ "quantization": "none",
33
+ "container": "TGI 2.2.0",
34
+ "status": "OK",
35
+ "tokensPerSecond": "16.3",
36
+ },
37
+ {
38
+ "instanceType": "g6.12xlarge",
39
+ "quantization": "none",
40
+ "container": "TGI 2.2.0",
41
+ "status": "OK",
42
+ "tokensPerSecond": "54.2",
43
+ },
44
+ {
45
+ "instanceType": "inf2.*",
46
+ "container": "TGI 2.2.0",
47
+ "status": "not supported",
48
+ "tokensPerSecond": "-",
49
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
50
+ },
51
+ {
52
+ "instanceType": "g6e.2xlarge",
53
+ "configurations": [
54
+ {
55
+ "container": "TGI 2.2.0",
56
+ "quantization": "none",
57
+ "status": "OK",
58
+ "tokensPerSecond": "45",
59
+ },
60
+ {
61
+ "container": "SGLang 0.2.13",
62
+ "quantization": "none",
63
+ "status": "OK",
64
+ "tokensPerSecond": "48",
65
+ },
66
+ {
67
+ "container": "vLLM 0.5.5",
68
+ "quantization": "none",
69
+ "status": "OK",
70
+ "tokensPerSecond": "45.7",
71
+ },
72
+ ],
73
+ },
74
+ ],
75
+ }
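Entries such as the g6e.2xlarge block above use a nested configurations list to record several serving containers on the same instance; display_results in app.py flattens these into one row per container, copying the instance-level fields down. A short sketch of that flattening on this file's data:

# Sketch: flattening the nested g6e.2xlarge entry into per-container rows
from results_arcee_agent import results_arcee_agent

entry = results_arcee_agent["configurations"][-1]  # the g6e.2xlarge block
rows = [{"instanceType": entry["instanceType"], **nested} for nested in entry["configurations"]]
# -> three rows: TGI 2.2.0, SGLang 0.2.13 and vLLM 0.5.5, each with its own tokensPerSecond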
results_arcee_lite.py ADDED
@@ -0,0 +1,95 @@
1
+ """Module containing performance results for the Arcee-Lite model."""
2
+
3
+ results_arcee_lite = {
4
+ "name": "Arcee-Lite",
5
+ "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "c6i.xlarge",
9
+ "quantization": "bitsandbytes-nf4",
10
+ "container": "TGI 2.2.0",
11
+ "status": "KO",
12
+ "tokensPerSecond": "-",
13
+ "notes": "OOM, might work with a prequantized model",
14
+ },
15
+ {
16
+ "instanceType": "c6i.2xlarge",
17
+ "quantization": "bitsandbytes-nf4",
18
+ "container": "TGI 2.2.0",
19
+ "status": "KO",
20
+ "tokensPerSecond": "-",
21
+ "notes": "OOM, might work with a prequantized model",
22
+ },
23
+ {
24
+ "instanceType": "c6i.4xlarge",
25
+ "configurations": [
26
+ {
27
+ "quantization": "none",
28
+ "container": "TGI 2.2.0",
29
+ "status": "OK",
30
+ "tokensPerSecond": "10.7",
31
+ },
32
+ {
33
+ "quantization": "bitsandbytes (int8)",
34
+ "container": "TGI 2.2.0",
35
+ "status": "OK",
36
+ "tokensPerSecond": "10.5",
37
+ },
38
+ {
39
+ "quantization": "bitsandbytes-nf4",
40
+ "container": "TGI 2.2.0",
41
+ "status": "OK",
42
+ "tokensPerSecond": "10.6",
43
+ },
44
+ ],
45
+ },
46
+ {
47
+ "instanceType": "c7i.4xlarge",
48
+ "quantization": "none",
49
+ "container": "TGI 2.2.0",
50
+ "status": "waiting for quota",
51
+ "tokensPerSecond": "-",
52
+ },
53
+ {
54
+ "instanceType": "g5.xlarge",
55
+ "configurations": [
56
+ {
57
+ "quantization": "none",
58
+ "container": "TGI 2.2.0",
59
+ "status": "OK",
60
+ "tokensPerSecond": "110",
61
+ },
62
+ {
63
+ "quantization": "none",
64
+ "container": "DJL 0.28 vLLM",
65
+ "status": "OK",
66
+ "tokensPerSecond": "105",
67
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
68
+ },
69
+ ],
70
+ },
71
+ {
72
+ "instanceType": "g6e.2xlarge",
73
+ "configurations": [
74
+ {
75
+ "container": "TGI 2.2.0",
76
+ "quantization": "none",
77
+ "status": "OK",
78
+ "tokensPerSecond": "160",
79
+ },
80
+ {
81
+ "container": "SGLang 0.2.13",
82
+ "quantization": "none",
83
+ "status": "OK",
84
+ "tokensPerSecond": "167",
85
+ },
86
+ {
87
+ "container": "vLLM 0.5.5",
88
+ "quantization": "none",
89
+ "status": "OK",
90
+ "tokensPerSecond": "150",
91
+ },
92
+ ],
93
+ },
94
+ ],
95
+ }
results_arcee_meraj.py ADDED
@@ -0,0 +1,24 @@
1
+ """Module containing performance results for the Arcee-Meraj model."""
2
+
3
+ results_arcee_meraj = {
4
+ "name": "Arcee-Meraj",
5
+ "modelType": "Qwen2 72B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.12xlarge",
9
+ "quantization": "awq",
10
+ "container": "TGI 2.2.0",
11
+ "status": "OK",
12
+ "tokensPerSecond": "33",
13
+ "notes": "",
14
+ },
15
+ {
16
+ "instanceType": "p4d.24xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "38",
21
+ "notes": "",
22
+ },
23
+ ],
24
+ }
results_arcee_nova.py ADDED
@@ -0,0 +1,141 @@
1
+ """Module containing performance results for the Arcee-Nova model."""
2
+
3
+ results_arcee_nova = {
4
+ "name": "Arcee-Nova",
5
+ "modelType": "Qwen2 72B",
6
+ "notes": "",
7
+ "configurations": [
8
+ {
9
+ "instanceType": "g4dn.12xlarge",
10
+ "quantization": "bitsandbytes-nf4",
11
+ "container": "TGI 2.2.0",
12
+ "status": "KO",
13
+ "tokensPerSecond": "-",
14
+ "notes": "Flash Attention requires Ampere GPUs or newer",
15
+ },
16
+ {
17
+ "instanceType": "g5.12xlarge",
18
+ "configurations": [
19
+ {
20
+ "quantization": "bitsandbytes-nf4",
21
+ "container": "TGI 2.2.0",
22
+ "status": "OK",
23
+ "tokensPerSecond": "12",
24
+ },
25
+ {
26
+ "quantization": "bitsandbytes-fp4",
27
+ "container": "TGI 2.2.0",
28
+ "status": "OK",
29
+ "tokensPerSecond": "12",
30
+ },
31
+ {
32
+ "quantization": "bitsandbytes (int8)",
33
+ "container": "TGI 2.2.0",
34
+ "status": "KO",
35
+ "tokensPerSecond": "-",
36
+ "notes": "CUDA OOM",
37
+ },
38
+ {
39
+ "quantization": "eetq (int8)",
40
+ "container": "TGI 2.2.0",
41
+ "status": "KO",
42
+ "tokensPerSecond": "-",
43
+ "notes": "[FT Error] Heurisitc failed to find a valid config.",
44
+ },
45
+ ],
46
+ },
47
+ {
48
+ "instanceType": "g5.48xlarge",
49
+ "configurations": [
50
+ {
51
+ "quantization": "none",
52
+ "container": "TGI 2.2.0",
53
+ "status": "KO",
54
+ "tokensPerSecond": "-",
55
+ "notes": "CUDA OOM (but g6.48xlarge works!)",
56
+ },
57
+ {
58
+ "quantization": "bitsandbytes-nf4",
59
+ "container": "TGI 2.2.0",
60
+ "status": "OK",
61
+ "tokensPerSecond": "12.3",
62
+ },
63
+ {
64
+ "quantization": "bitsandbytes-fp4",
65
+ "container": "TGI 2.2.0",
66
+ "status": "OK",
67
+ "tokensPerSecond": "12.5",
68
+ },
69
+ {
70
+ "quantization": "bitsandbytes (int8)",
71
+ "container": "TGI 2.2.0",
72
+ "status": "KO",
73
+ "tokensPerSecond": "-",
74
+ "notes": "The model deploys, but inference times out.",
75
+ },
76
+ ],
77
+ },
78
+ {
79
+ "instanceType": "g6.12xlarge",
80
+ "configurations": [
81
+ {
82
+ "quantization": "bitsandbytes-nf4",
83
+ "container": "TGI 2.2.0",
84
+ "status": "OK",
85
+ "tokensPerSecond": "1.5-2",
86
+ "notes": "Too slow, timeouts are likely",
87
+ },
88
+ {
89
+ "quantization": "bitsandbytes-fp4",
90
+ "container": "TGI 2.2.0",
91
+ "status": "OK",
92
+ "tokensPerSecond": "2",
93
+ "notes": "Too slow, timeouts are likely",
94
+ },
95
+ {
96
+ "quantization": "bitsandbytes (int8)",
97
+ "container": "TGI 2.2.0",
98
+ "status": "KO",
99
+ "tokensPerSecond": "-",
100
+ "notes": "CUDA OOM",
101
+ },
102
+ ],
103
+ },
104
+ {
105
+ "instanceType": "g6.48xlarge",
106
+ "quantization": "none",
107
+ "container": "TGI 2.2.0",
108
+ "status": "OK",
109
+ "tokensPerSecond": "12",
110
+ },
111
+ {
112
+ "instanceType": "p4d.24xlarge",
113
+ "quantization": "none",
114
+ "container": "TGI 2.2.0",
115
+ "status": "OK",
116
+ "tokensPerSecond": "40",
117
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
118
+ },
119
+ {
120
+ "instanceType": "p4de.24xlarge",
121
+ "quantization": "none",
122
+ "container": "TGI 2.2.0",
123
+ "status": "waiting for quota",
124
+ },
125
+ {
126
+ "instanceType": "p5.48xlarge",
127
+ "quantization": "none",
128
+ "container": "TGI 2.2.0",
129
+ "status": "OK",
130
+ "tokensPerSecond": "58",
131
+ "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
132
+ },
133
+ {
134
+ "instanceType": "inf2.*",
135
+ "container": "TGI 2.2.0",
136
+ "status": "not supported",
137
+ "tokensPerSecond": "-",
138
+ "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
139
+ },
140
+ ],
141
+ }
results_arcee_scribe.py ADDED
@@ -0,0 +1,71 @@
1
+ """Module containing performance results for the Arcee-Scribe model."""
2
+
3
+ results_arcee_scribe = {
4
+ "name": "Arcee-Scribe",
5
+ "modelType": "InternLM2.5 8B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.2xlarge",
9
+ "quantization": "none",
10
+ "container": "DJL 0.28 vLLM",
11
+ "status": "OK",
12
+ "tokensPerSecond": 29,
13
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
14
+ },
15
+ {
16
+ "instanceType": "g5.12xlarge",
17
+ "quantization": "none",
18
+ "container": "DJL 0.28 vLLM",
19
+ "status": "OK",
20
+ "tokensPerSecond": 65,
21
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
22
+ },
23
+ {
24
+ "instanceType": "g5.48xlarge",
25
+ "quantization": "none",
26
+ "container": "DJL 0.28 vLLM",
27
+ "status": "OK",
28
+ "tokensPerSecond": 80,
29
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
30
+ },
31
+ {
32
+ "instanceType": "g6.2xlarge",
33
+ "quantization": "none",
34
+ "container": "DJL 0.28 vLLM",
35
+ "status": "OK",
36
+ "tokensPerSecond": 16,
37
+ "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
38
+ },
39
+ {
40
+ "instanceType": "g6.12xlarge",
41
+ "quantization": "none",
42
+ "container": "DJL 0.28 vLLM",
43
+ "status": "OK",
44
+ "tokensPerSecond": 50,
45
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
46
+ },
47
+ {
48
+ "instanceType": "g6.48xlarge",
49
+ "quantization": "none",
50
+ "container": "DJL 0.28 vLLM",
51
+ "status": "OK",
52
+ "tokensPerSecond": 69,
53
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
54
+ },
55
+ {
56
+ "instanceType": "g6e.2xlarge",
57
+ "quantization": "none",
58
+ "container": "SGLang 0.2.13",
59
+ "status": "OK",
60
+ "tokensPerSecond": 46,
61
+ },
62
+ {
63
+ "instanceType": "p4d.24xlarge",
64
+ "quantization": "none",
65
+ "container": "DJL 0.28 vLLM",
66
+ "status": "OK",
67
+ "tokensPerSecond": 82,
68
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
69
+ },
70
+ ],
71
+ }
results_arcee_spark.py ADDED
@@ -0,0 +1,3 @@
1
+ """Module containing performance results for the Arcee-Spark model."""
2
+
3
+ results_arcee_spark = {"name": "Arcee-Spark", "modelType": "Qwen2 7B"}
results_arcee_supernova.py ADDED
@@ -0,0 +1,24 @@
1
+ """Module containing performance results for the Arcee-SuperNova model."""
2
+
3
+ results_arcee_supernova = {
4
+ "name": "Arcee-SuperNova",
5
+ "modelType": "Llama 3.1 70B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.12xlarge",
9
+ "quantization": "awq",
10
+ "container": "TGI 2.2.0",
11
+ "status": "OK",
12
+ "tokensPerSecond": "33",
13
+ "notes": "",
14
+ },
15
+ {
16
+ "instanceType": "p4d.24xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "38",
21
+ "notes": "",
22
+ },
23
+ ],
24
+ }
results_llama_spark.py ADDED
@@ -0,0 +1,73 @@
1
+ """Module containing performance results for the Llama-Spark model."""
2
+
3
+ results_llama_spark = {
4
+ "name": "Llama-Spark",
5
+ "modelType": "Llama 3.1 8B",
6
+ "configurations": [
7
+ {
8
+ "instanceType": "g5.2xlarge",
9
+ "quantization": "none",
10
+ "container": "TGI 2.2.0",
11
+ "status": "OK",
12
+ "tokensPerSecond": "29",
13
+ "notes": "4K/8K fails",
14
+ },
15
+ {
16
+ "instanceType": "g5.12xlarge",
17
+ "quantization": "none",
18
+ "container": "TGI 2.2.0",
19
+ "status": "OK",
20
+ "tokensPerSecond": "85",
21
+ "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
22
+ },
23
+ {
24
+ "instanceType": "g5.48xlarge",
25
+ "quantization": "none",
26
+ "container": "TGI 2.2.0",
27
+ "status": "OK",
28
+ "tokensPerSecond": "105",
29
+ "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
30
+ },
31
+ {
32
+ "instanceType": "g6.12xlarge",
33
+ "quantization": "none",
34
+ "container": "TGI 2.2.0",
35
+ "status": "OK",
36
+ "tokensPerSecond": "51",
37
+ "notes": "same as g5?",
38
+ },
39
+ {
40
+ "instanceType": "g6.48xlarge",
41
+ "quantization": "none",
42
+ "container": "TGI 2.2.0",
43
+ "status": "OK",
44
+ "tokensPerSecond": "81",
45
+ "notes": "same as g5?",
46
+ },
47
+ {
48
+ "instanceType": "g6e.2xlarge",
49
+ "quantization": "none",
50
+ "status": "OK",
51
+ "configurations": [
52
+ {"container": "TGI 2.2.0", "tokensPerSecond": "42.1"},
53
+ {"container": "SGLang 0.2.13", "tokensPerSecond": "45"},
54
+ {"container": "vLLM 0.5.5", "tokensPerSecond": "43.4"},
55
+ ],
56
+ },
57
+ {
58
+ "instanceType": "p4d.24xlarge",
59
+ "quantization": "none",
60
+ "container": "TGI 2.2.0",
61
+ "status": "OK",
62
+ "tokensPerSecond": "145",
63
+ "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
64
+ },
65
+ {
66
+ "instanceType": "inf2.*",
67
+ "container": "TGI 2.2.0",
68
+ "status": "not supported",
69
+ "tokensPerSecond": "-",
70
+ "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
71
+ },
72
+ ],
73
+ }