Spaces:
Runtime error
Runtime error
Delete metrics.py
Browse files- metrics.py +0 -118
metrics.py
DELETED
|
@@ -1,118 +0,0 @@
|
|
| 1 |
-
from collections import Counter, defaultdict
|
| 2 |
-
from typing import List
|
| 3 |
-
|
| 4 |
-
import numpy as np
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def get_servers_metrics(model_reports) -> List[str]:
    """Build the server-level section of the metrics report.

    Every server row of every model report is visited; rows whose span has
    no ``server_info`` are ignored.

    Args:
        model_reports: iterable of per-model reports. Each report is indexed
            with ``"server_rows"``; each row is indexed with ``"span"``, and
            the span exposes ``server_info`` (possibly ``None``) carrying
            ``next_pings`` (mapping or ``None``), ``using_relay`` and
            ``version``.

    Returns:
        Lines of text, one metric (or ``#`` section header) per line:
        peers-per-server and unreachable-ping share (only when at least one
        ping was reported), server counters, ping percentiles (only when at
        least one finite ping exists) and per-version server counts.
    """
    servers_num_total = 0
    servers_num_relay = 0
    num_peers = 0
    pings = []
    version_counts = Counter()
    result = ["# SERVER LEVEL METRICS"]
    inf = float("inf")

    for report in model_reports:
        for server in report["server_rows"]:
            server_info = server["span"].server_info
            if server_info is None:
                continue

            next_pings = server_info.next_pings
            if next_pings is not None:
                # Only servers that report pings count towards the total.
                servers_num_total += 1
                num_peers += len(next_pings)
                pings.extend(v for v in next_pings.values() if v != inf)

            if server_info.using_relay:
                servers_num_relay += 1

            version = server_info.version
            if version:
                version_counts[version] += 1

    # Every reported ping that is not finite-listed above is an `inf` ping.
    num_ping_infs = num_peers - len(pings)

    # Gate on the total number of reported pings rather than on the finite
    # ones: when every ping is `inf` the share is 1.0 and must still be
    # reported. num_peers > 0 also implies servers_num_total > 0.
    if num_peers > 0:
        peers_per_srv = num_peers / servers_num_total
        pings_inf_share = num_ping_infs / num_peers

        result.extend(
            [
                f"peers_per_srv {peers_per_srv:.1f}",
                f"pings_inf_share {pings_inf_share:.3f}",
            ]
        )

    result.append(f"servers_num_total {servers_num_total}")
    result.append(f"servers_num_relay {servers_num_relay}")

    if pings:
        result.append("# PINGS")
        percentiles = (25, 50, 75, 90, 95)
        # np.percentile does not need pre-sorted input and accepts all
        # requested percentiles in a single call.
        for pct, value in zip(percentiles, np.percentile(pings, percentiles)):
            result.append(f'ping_pct{{pct="{pct}"}} {value:.4f}')

    result.append("# VERSIONS")
    for version_number, version_count in version_counts.items():
        result.append(f'server_version{{version_number="{version_number}"}} {version_count}')

    return result
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
def get_models_metrics(model_reports) -> List[str]:
    """Build the model-level section of the metrics report.

    For each model report, per-block counters are accumulated over the block
    range ``[span.start, span.end)`` of every server row: a total count, a
    count per server state, and the summed ``network_rps`` /
    ``inference_rps`` / ``forward_rps`` values of servers that expose
    ``server_info``.

    Args:
        model_reports: iterable of per-model reports, each indexed with
            ``"dht_prefix"``, ``"num_blocks"`` and ``"server_rows"``. Each
            row is indexed with ``"state"`` and ``"span"``; the span exposes
            ``start``, ``end`` and ``server_info`` (possibly ``None``).

    Returns:
        Lines of text, one metric (or ``#`` header) per line.
    """
    rps_kinds = ("network_rps", "inference_rps", "forward_rps")
    result = [
        "# MODEL LEVEL METRICS",
    ]

    for report in model_reports:
        model_name = report["dht_prefix"]
        num_blocks = report["num_blocks"]

        result.append(f"# MODEL: {model_name} {'-' * 50}")

        # Bind num_blocks as a default so the factory does not depend on the
        # loop variable at call time.
        blocks = defaultdict(lambda n=num_blocks: np.zeros(n))

        for server in report["server_rows"]:
            span = server["span"]
            # Clamp the span to the model's block range so that a span
            # reaching outside [0, num_blocks) cannot raise IndexError or
            # wrap around via negative indices.
            start = max(span.start, 0)
            end = min(span.end, num_blocks)
            if start >= end:
                continue

            blocks["total"][start:end] += 1
            blocks[server["state"]][start:end] += 1

            server_info = span.server_info
            if server_info is not None:
                # The rps values are per server, so look them up once and
                # apply them to the whole span at once.
                for rps in rps_kinds:
                    rps_value = getattr(server_info, rps, 0)
                    if rps_value is not None:
                        blocks[rps][start:end] += rps_value

        # A model with zero blocks yields empty arrays: .min() would raise
        # ValueError and .mean() would produce NaN, so report 0.0 instead.
        has_blocks = num_blocks > 0
        blocks_online_min = blocks["online"].min() if has_blocks else 0.0

        result.extend(
            [
                f'n_blocks{{model="{model_name}"}} {num_blocks}',
                f'servers_num{{model="{model_name}"}} {len(report["server_rows"])}',
                f'blocks_total{{model="{model_name}"}} {blocks["total"].sum()}',
                f'blocks_online_min{{model="{model_name}"}} {blocks_online_min}',
            ]
        )

        for block_state in ("online", "joining", "offline", "unreachable"):
            result.append(f'blocks{{model="{model_name}",state="{block_state}"}} {blocks[block_state].sum():.0f}')

        for rps in rps_kinds:
            rps_type = rps.split("_")[0]
            rps_avg = blocks[rps].mean() if has_blocks else 0.0
            rps_min = blocks[rps].min() if has_blocks else 0.0
            result.append(f'rps_avg{{model="{model_name}",rps="{rps_type}"}} {rps_avg:.1f}')
            result.append(f'rps_min{{model="{model_name}",rps="{rps_type}"}} {rps_min:.1f}')

    return result
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def get_prometheus_metrics(state_dict) -> str:
    """Prepare metrics in the Prometheus text exposition format.

    Format description:
    https://prometheus.io/docs/instrumenting/exposition_formats/

    Args:
        state_dict: mapping with a required ``"model_reports"`` entry and an
            optional numeric ``"update_duration"`` entry.

    Returns:
        A multiline string with a single metric (or ``#`` header) per line:
        general metrics first, then the server-level and model-level
        sections.
    """
    result = ["# GENERAL METRICS"]

    # Formatting None with ":.1f" raises TypeError, so the sample is simply
    # omitted when no update duration has been recorded.
    update_duration = state_dict.get("update_duration")
    if update_duration is not None:
        result.append(f"update_duration {update_duration:.1f}")

    result.extend(get_servers_metrics(state_dict["model_reports"]))

    result.extend(get_models_metrics(state_dict["model_reports"]))

    return "\n".join(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|