Spaces:
Sleeping
Sleeping
Commit
·
a1f6c2e
1
Parent(s):
bd9edb7
update viz
Browse files
- src/latency_score_memory.py +1 -1
- src/llm_perf.py +9 -8
src/latency_score_memory.py
CHANGED
|
@@ -15,7 +15,7 @@ SCORE_MEMORY_LATENCY_DATA = [
|
|
| 15 |
"Decode Throughput (tokens/s)",
|
| 16 |
"Allocated Memory (MB)",
|
| 17 |
"E2E Latency (s)",
|
| 18 |
-
"E2E Throughput (tokens/s)",
|
| 19 |
]
|
| 20 |
|
| 21 |
|
|
|
|
| 15 |
"Decode Throughput (tokens/s)",
|
| 16 |
"Allocated Memory (MB)",
|
| 17 |
"E2E Latency (s)",
|
| 18 |
+
# "E2E Throughput (tokens/s)",
|
| 19 |
]
|
| 20 |
|
| 21 |
|
src/llm_perf.py
CHANGED
|
@@ -12,22 +12,23 @@ COLUMNS_MAPPING = {
|
|
| 12 |
"Model": "Model 🤗",
|
| 13 |
"Arch": "Arch 🏛️",
|
| 14 |
"Size": "Params (B)",
|
| 15 |
-
|
| 16 |
-
# deployment settings
|
| 17 |
-
"backend.name": "Backend 🏭",
|
| 18 |
-
"backend.torch_dtype": "DType 📥",
|
| 19 |
-
"optimization": "Optimization 🛠️",
|
| 20 |
-
"quantization": "Quantization 🗜️",
|
| 21 |
# primary measurements
|
| 22 |
"forward.latency(s)": "Prefill Latency (s)",
|
| 23 |
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
|
| 24 |
"generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
|
| 25 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# additional measurements
|
|
|
|
| 27 |
"generate.latency(s)": "E2E Latency (s)",
|
| 28 |
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
|
| 29 |
-
"generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
|
| 30 |
-
"generate.max_memory_used(MB)": "Used Memory (MB)",
|
| 31 |
}
|
| 32 |
SORTING_COLUMNS = [
|
| 33 |
"Open LLM Score (%)",
|
|
|
|
| 12 |
"Model": "Model 🤗",
|
| 13 |
"Arch": "Arch 🏛️",
|
| 14 |
"Size": "Params (B)",
|
| 15 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
# primary measurements
|
| 17 |
"forward.latency(s)": "Prefill Latency (s)",
|
| 18 |
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
|
| 19 |
"generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
|
| 20 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
|
| 21 |
+
# deployment settings
|
| 22 |
+
"backend.name": "Backend 🏭",
|
| 23 |
+
"backend.torch_dtype": "DType 📥",
|
| 24 |
+
"optimization": "Optimization 🛠️",
|
| 25 |
+
"quantization": "Quantization 🗜️",
|
| 26 |
# additional measurements
|
| 27 |
+
"Score": "Open LLM Score (%)",
|
| 28 |
"generate.latency(s)": "E2E Latency (s)",
|
| 29 |
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
|
| 30 |
+
# "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
|
| 31 |
+
# "generate.max_memory_used(MB)": "Used Memory (MB)",
|
| 32 |
}
|
| 33 |
SORTING_COLUMNS = [
|
| 34 |
"Open LLM Score (%)",
|