"""Module containing performance results for the Llama-3-Supernova-Lite model."""

results_llama_supernova_lite = {
    "name": "Llama-3.1-SuperNova-Lite",
    "modelType": "Llama 3.1 8B",
    "configurations": [
        {
            "instanceType": "c7i.4xlarge",
            "quantization": "Q6_K",
            "container": "llama.cpp 10/18/24",
            "status": "OK",
            "tokensPerSecond": "xxx",
            "notes": "AMX enabled, Flash Attention enabled",
        },
        {
            "instanceType": "c7i.4xlarge",
            "quantization": "Q5_K",
            "container": "llama.cpp 10/18/24",
            "status": "OK",
            "tokensPerSecond": "xxx",
            "notes": "AMX enabled, Flash Attention enabled",
        },
        {
            "instanceType": "c7i.4xlarge",
            "quantization": "Q4_K",
            "container": "llama.cpp 10/18/24",
            "status": "OK",
            "tokensPerSecond": "xxx",
            "notes": "AMX enabled, Flash Attention enabled",
        },
        {
            "instanceType": "c7i.4xlarge",
            "quantization": "IQ4_XS",
            "container": "llama.cpp 10/18/24",
            "status": "OK",
            "tokensPerSecond": "xxx",
            "notes": "AMX enabled, Flash Attention enabled",
        },
        {
            "instanceType": "c7g.8xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/18/24",
            "status": "OK",
            "tokensPerSecond": "39.7",
            "notes": "requantized from Q4_K_S",
        },
        {
            "instanceType": "c7g.16xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/18/24",
            "status": "OK",
            "tokensPerSecond": "45.5",
            "notes": "",
        },
        {
            "instanceType": "c8g.4xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 11/05/24",
            "status": "OK",
            "tokensPerSecond": "34",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.4xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "40",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.8xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "63",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.16xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "70",
            "notes": "with Flash Attention",
        },
    ],
}
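

# A minimal usage sketch (hypothetical, not part of the recorded data): print
# the measured throughput for each instance type and quantization level.
# Entries whose tokensPerSecond is still the "xxx" placeholder are skipped.
if __name__ == "__main__":
    for config in results_llama_supernova_lite["configurations"]:
        tps = config["tokensPerSecond"]
        if tps == "xxx":
            continue  # throughput not recorded yet
        notes = config["notes"] or "no notes"
        print(f'{config["instanceType"]:>14}  {config["quantization"]:>10}  '
              f'{tps:>6} tokens/s  ({notes})')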