File size: 2,177 Bytes
0c0f086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Module containing performance results for the Arcee-Agent model."""

# Benchmark record for the Arcee-Agent model.
#
# Layout, as visible in this literal:
#   * Top level: "name", "modelType", a free-form "notes" string, and a
#     "configurations" list with one entry per instance type.
#   * Most entries are flat: instanceType / quantization / container /
#     status / tokensPerSecond, plus an optional per-entry "notes".
#   * The g6e.2xlarge entry instead nests its own "configurations" list,
#     one item per serving container tested on that instance type.
#   * "tokensPerSecond" is always a string; the failing ("KO") entry uses
#     "-" as a placeholder instead of a number.
results_arcee_agent = {
    "name": "Arcee-Agent",
    "modelType": "Qwen2 7B",
    "notes": "",
    "configurations": [
        {
            "instanceType": "g5.2xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "30",
        },
        {
            "instanceType": "g5.12xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "83",
        },
        {
            "instanceType": "g5.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "KO",
            "tokensPerSecond": "-",
            # The quoted ValueError text (including its unbalanced
            # parenthesis) is kept exactly as recorded; only the typo in
            # the follow-up sentence ("size ares" -> "sizes are") is fixed.
            "notes": (
                "ValueError: `num_heads` must be divisible by `num_shards` "
                "(got `num_heads`: 28 and `num_shards`: 8\n\n"
                "SM_NUM_GPUS=7 doesn't work either because tensor sizes are "
                "not a multiple of 7 (e.g., 512)"
            ),
        },
        {
            "instanceType": "g6.2xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "16.3",
        },
        {
            "instanceType": "g6.12xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "54.2",
        },
        {
            # Unlike the flat entries above, this instance type groups
            # several container results under a nested "configurations" list.
            "instanceType": "g6e.2xlarge",
            "configurations": [
                {
                    "container": "TGI 2.2.0",
                    "quantization": "none",
                    "status": "OK",
                    "tokensPerSecond": "45",
                },
                {
                    "container": "SGLang 0.2.13",
                    "quantization": "none",
                    "status": "OK",
                    "tokensPerSecond": "48",
                },
                {
                    "container": "vLLM 0.5.5",
                    "quantization": "none",
                    "status": "OK",
                    "tokensPerSecond": "45.7",
                },
            ],
        },
    ],
}