ldhldh committed on
Commit
8463451
1 Parent(s): 28e2fc4

Delete metrics.py

Browse files
Files changed (1) hide show
  1. metrics.py +0 -118
metrics.py DELETED
@@ -1,118 +0,0 @@
1
- from collections import Counter, defaultdict
2
- from typing import List
3
-
4
- import numpy as np
5
-
6
-
7
def get_servers_metrics(model_reports) -> List[str]:
    """Build server-level metric lines aggregated over all model reports.

    Args:
        model_reports: iterable of report mappings. Each report must provide a
            ``"server_rows"`` sequence whose items expose
            ``row["span"].server_info`` (either ``None`` or an object with
            ``next_pings``, ``using_relay`` and ``version`` attributes).

    Returns:
        A list of text lines, one metric or ``#`` comment per line:
        ``peers_per_srv`` and ``pings_inf_share`` (only when at least one ping
        was reported), ``servers_num_total``, ``servers_num_relay``, ping
        percentiles over the finite pings (only when there are any) and one
        ``server_version`` line per distinct version string.

    Note:
        ``servers_num_total`` counts only servers whose ``next_pings`` is not
        ``None``; servers without ``server_info`` are skipped entirely.
    """
    servers_num_total = 0
    servers_num_relay = 0
    num_peers = 0
    num_ping_infs = 0
    pings = []  # finite ping values only
    version_counts = Counter()
    result = ["# SERVER LEVEL METRICS"]
    inf = float("inf")

    for report in model_reports:
        for server in report["server_rows"]:
            server_info = server["span"].server_info
            if server_info is None:
                continue

            next_pings = server_info.next_pings
            if next_pings is not None:
                servers_num_total += 1
                num_peers += len(next_pings)
                finite_pings = [ping for ping in next_pings.values() if ping != inf]
                pings.extend(finite_pings)
                # Everything that is not "!= inf" is exactly "== inf".
                num_ping_infs += len(next_pings) - len(finite_pings)

            if server_info.using_relay:
                servers_num_relay += 1

            version = server_info.version
            if version:
                version_counts[version] += 1

    # Guard on the number of reported peers rather than on the finite pings:
    # when every ping is infinite the ratios are still well defined
    # (pings_inf_share == 1.0) and are the most interesting case to report.
    # num_peers > 0 implies servers_num_total > 0, so neither division can fail.
    if num_peers > 0:
        peers_per_srv = num_peers / servers_num_total
        pings_inf_share = num_ping_infs / num_peers
        result.extend(
            [
                f"peers_per_srv {peers_per_srv:.1f}",
                f"pings_inf_share {pings_inf_share:.3f}",
            ]
        )

    result.append(f"servers_num_total {servers_num_total}")
    result.append(f"servers_num_relay {servers_num_relay}")

    if pings:
        result.append("# PINGS")
        percentiles = (25, 50, 75, 90, 95)
        # np.percentile does not need pre-sorted input and accepts all
        # requested percentiles in a single call.
        for pct, value in zip(percentiles, np.percentile(pings, percentiles)):
            result.append(f'ping_pct{{pct="{pct}"}} {value:.4f}')

    result.append("# VERSIONS")
    result.extend(
        f'server_version{{version_number="{version_number}"}} {version_count}'
        for version_number, version_count in version_counts.items()
    )

    return result
59
-
60
-
61
def get_models_metrics(model_reports) -> List[str]:
    """Build per-model metric lines from the given model reports.

    Args:
        model_reports: iterable of report mappings with the keys
            ``"dht_prefix"``, ``"num_blocks"`` and ``"server_rows"``. Each
            server row provides ``row["state"]`` and ``row["span"]`` with
            ``start``, ``end`` and ``server_info`` attributes.

    Returns:
        A list of text lines. For every model: a ``# MODEL`` header, block and
        server counts, the number of served blocks per state, and the average
        and minimum per-block sum of each rps attribute.
    """
    rps_attrs = ("network_rps", "inference_rps", "forward_rps")
    lines = ["# MODEL LEVEL METRICS"]

    for report in model_reports:
        model_name = report["dht_prefix"]
        num_blocks = report["num_blocks"]
        server_rows = report["server_rows"]

        lines.append(f"# MODEL: {model_name} {'-' * 50}")

        # One zero-initialised accumulator per key, indexed by block number.
        # The size is bound when the lambda is created.
        per_block = defaultdict(lambda size=num_blocks: np.zeros(size))

        for row in server_rows:
            span = row["span"]
            for idx in range(span.start, span.end):
                per_block["total"][idx] += 1
                per_block[row["state"]][idx] += 1

                info = span.server_info
                if info is None:
                    continue
                for attr in rps_attrs:
                    value = getattr(info, attr, 0)
                    if value is not None:
                        per_block[attr][idx] += value

        lines.append(f'n_blocks{{model="{model_name}"}} {num_blocks}')
        lines.append(f'servers_num{{model="{model_name}"}} {len(server_rows)}')
        lines.append(f'blocks_total{{model="{model_name}"}} {per_block["total"].sum()}')
        lines.append(f'blocks_online_min{{model="{model_name}"}} {per_block["online"].min()}')

        lines.extend(
            f'blocks{{model="{model_name}",state="{block_state}"}} {per_block[block_state].sum():.0f}'
            for block_state in ("online", "joining", "offline", "unreachable")
        )

        for attr in rps_attrs:
            rps_type = attr.split("_")[0]
            lines.append(f'rps_avg{{model="{model_name}",rps="{rps_type}"}} {per_block[attr].mean():.1f}')
            lines.append(f'rps_min{{model="{model_name}",rps="{rps_type}"}} {per_block[attr].min():.1f}')

    return lines
102
-
103
-
104
def get_prometheus_metrics(state_dict) -> str:
    """Render the collected state as metrics in the Prometheus text format.

    Format description: https://prometheus.io/docs/instrumenting/exposition_formats/

    Args:
        state_dict: mapping with a required ``"model_reports"`` entry (passed
            to ``get_servers_metrics`` and ``get_models_metrics``) and an
            optional numeric ``"update_duration"`` entry.

    Returns:
        A multiline string with a single metric (or ``#`` comment) per line.

    Raises:
        KeyError: if ``"model_reports"`` is missing from ``state_dict``.
    """
    result = ["# GENERAL METRICS"]

    # Formatting None with ".1f" raises TypeError, so the metric is simply
    # omitted when the duration is missing or not yet known.
    update_duration = state_dict.get("update_duration")
    if update_duration is not None:
        result.append(f"update_duration {update_duration:.1f}")

    model_reports = state_dict["model_reports"]
    result.extend(get_servers_metrics(model_reports))
    result.extend(get_models_metrics(model_reports))

    return "\n".join(result)