Spaces:
Sleeping
Sleeping
Delete metrics.py
Browse files- metrics.py +0 -118
metrics.py
DELETED
@@ -1,118 +0,0 @@
|
|
1 |
-
from collections import Counter, defaultdict
|
2 |
-
from typing import List
|
3 |
-
|
4 |
-
import numpy as np
|
5 |
-
|
6 |
-
|
7 |
-
def get_servers_metrics(model_reports) -> List[str]:
    """Build the server-level section of the metrics output.

    Args:
        model_reports: iterable of per-model report mappings. Each report is
            read through its ``"server_rows"`` list; every row exposes a
            ``"span"`` object whose ``server_info`` attribute is either None
            or an object with ``next_pings`` (mapping peer -> ping, or None),
            ``using_relay`` and ``version`` attributes.

    Returns:
        A list of text lines: a section header, the peer/ping aggregates
        (only when at least one ping was reported), server counters, ping
        percentiles (only when at least one finite ping exists) and
        per-version server counts.
    """
    inf = float("inf")
    percentiles = (25, 50, 75, 90, 95)

    servers_num_total = 0  # counts only servers that reported a ping table
    servers_num_relay = 0
    pings = []  # every reported ping that is not +inf
    num_ping_infs = 0
    version_counts = Counter()
    result = ["# SERVER LEVEL METRICS"]

    for report in model_reports:
        for server in report["server_rows"]:
            server_info = server["span"].server_info
            if server_info is None:
                continue

            next_pings = server_info.next_pings
            if next_pings is not None:
                servers_num_total += 1
                # Single pass: an infinite ping is only counted, anything
                # else is kept for the percentile computation.
                for ping in next_pings.values():
                    if ping == inf:
                        num_ping_infs += 1
                    else:
                        pings.append(ping)

            if server_info.using_relay:
                servers_num_relay += 1

            version = server_info.version
            if version:
                version_counts[version] += 1

    # Guard on the total number of reported pings rather than on the finite
    # ones only: when every ping is infinite the share is 1.0 and is still
    # worth reporting, while a zero total would divide by zero.
    num_peers = len(pings) + num_ping_infs
    if servers_num_total > 0 and num_peers > 0:
        peers_per_srv = num_peers / servers_num_total
        pings_inf_share = num_ping_infs / num_peers
        result.append(f"peers_per_srv {peers_per_srv:.1f}")
        result.append(f"pings_inf_share {pings_inf_share:.3f}")

    result.append(f"servers_num_total {servers_num_total}")
    result.append(f"servers_num_relay {servers_num_relay}")

    if pings:
        result.append("# PINGS")
        # np.percentile does not need pre-sorted input; compute all
        # requested percentiles in one call.
        for pct, value in zip(percentiles, np.percentile(pings, percentiles)):
            result.append(f'ping_pct{{pct="{pct}"}} {value:.4f}')

    result.append("# VERSIONS")
    for version_number, version_count in version_counts.items():
        result.append(f'server_version{{version_number="{version_number}"}} {version_count}')

    return result
|
59 |
-
|
60 |
-
|
61 |
-
def get_models_metrics(model_reports) -> List[str]:
    """Build the model-level section of the metrics output.

    Args:
        model_reports: iterable of per-model report mappings. Each report is
            read through its ``"dht_prefix"`` (used as the model label),
            ``"num_blocks"`` and ``"server_rows"`` keys. Every server row
            exposes a ``"state"`` string and a ``"span"`` object with
            ``start``, ``end`` and ``server_info`` attributes;
            ``server_info`` may be None.

    Returns:
        A list of text lines: a section header followed, for every model, by
        block counts, per-state block totals and per-block throughput
        aggregates.
    """
    rps_names = ("network_rps", "inference_rps", "forward_rps")
    block_states = ("online", "joining", "offline", "unreachable")
    result = ["# MODEL LEVEL METRICS"]

    for report in model_reports:
        model_name = report["dht_prefix"]
        num_blocks = report["num_blocks"]
        server_rows = report["server_rows"]

        result.append(f"# MODEL: {model_name} {'-' * 50}")

        # One per-block accumulator per key ("total", each server state, each
        # throughput attribute), created as zeros on first access.
        blocks = defaultdict(lambda n=num_blocks: np.zeros(n))

        for server in server_rows:
            span = server["span"]
            server_info = span.server_info

            # The throughput values depend on the server only, so resolve
            # them once instead of once per covered block.
            rps_values = []
            if server_info is not None:
                for name in rps_names:
                    value = getattr(server_info, name, 0)
                    if value is not None:
                        rps_values.append((name, value))

            for block_idx in range(span.start, span.end):
                blocks["total"][block_idx] += 1
                blocks[server["state"]][block_idx] += 1
                for name, value in rps_values:
                    blocks[name][block_idx] += value

        # min()/mean() are undefined on a zero-length array (min raises,
        # mean yields NaN with a warning); report 0.0 for a model that has
        # no blocks instead of failing the whole export.
        online = blocks["online"]
        online_min = online.min() if online.size else 0.0

        result.extend(
            [
                f'n_blocks{{model="{model_name}"}} {num_blocks}',
                f'servers_num{{model="{model_name}"}} {len(server_rows)}',
                f'blocks_total{{model="{model_name}"}} {blocks["total"].sum()}',
                f'blocks_online_min{{model="{model_name}"}} {online_min}',
            ]
        )

        for block_state in block_states:
            result.append(f'blocks{{model="{model_name}",state="{block_state}"}} {blocks[block_state].sum():.0f}')

        for name in rps_names:
            rps_type = name.split("_")[0]
            per_block = blocks[name]
            rps_avg = per_block.mean() if per_block.size else 0.0
            rps_min = per_block.min() if per_block.size else 0.0
            result.append(f'rps_avg{{model="{model_name}",rps="{rps_type}"}} {rps_avg:.1f}')
            result.append(f'rps_min{{model="{model_name}",rps="{rps_type}"}} {rps_min:.1f}')

    return result
|
102 |
-
|
103 |
-
|
104 |
-
def get_prometheus_metrics(state_dict) -> str:
    """Render the collected state as Prometheus text-format metrics.

    Format description:
    https://prometheus.io/docs/instrumenting/exposition_formats/

    Args:
        state_dict: mapping with a required ``"model_reports"`` entry, which
            is passed to both get_servers_metrics and get_models_metrics,
            and an optional ``"update_duration"`` number.

    Returns:
        A multi-line string with a single metric (or comment) per line.
    """
    result = ["# GENERAL METRICS"]

    # The duration is optional: formatting None with ":.1f" would raise
    # TypeError, so the sample is omitted when no value is available.
    update_duration = state_dict.get("update_duration")
    if update_duration is not None:
        result.append(f"update_duration {update_duration:.1f}")

    model_reports = state_dict["model_reports"]
    result.extend(get_servers_metrics(model_reports))
    result.extend(get_models_metrics(model_reports))

    return "\n".join(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|