Commit
·
f3dc796
1
Parent(s):
29e37fd
update
Browse files- app.py +4 -6
- src/assets/text_content.py +8 -6
app.py
CHANGED
|
@@ -24,12 +24,14 @@ ALL_COLUMNS_MAPPING = {
|
|
| 24 |
# model
|
| 25 |
"Model": "Model 🤗",
|
| 26 |
"Arch": "Arch ποΈ",
|
| 27 |
-
"Size": "Size
|
| 28 |
# deployment settings
|
| 29 |
"backend.name": "Backend π",
|
| 30 |
"backend.torch_dtype": "Dtype 📥",
|
| 31 |
"optimizations": "Optimizations 🛠️",
|
| 32 |
"quantization": "Quantization ποΈ",
|
|
|
|
|
|
|
| 33 |
# throughput measurements
|
| 34 |
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
|
| 35 |
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
|
|
@@ -42,8 +44,6 @@ ALL_COLUMNS_MAPPING = {
|
|
| 42 |
"generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
|
| 43 |
# energy measurements
|
| 44 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
|
| 45 |
-
# quality measurements
|
| 46 |
-
"Score": "Avg Score (%) β¬οΈ",
|
| 47 |
}
|
| 48 |
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
|
| 49 |
SORTING_ASCENDING = [False, True]
|
|
@@ -148,9 +148,7 @@ def get_benchmark_chart(bench_df):
|
|
| 148 |
copy_df = bench_df.copy()
|
| 149 |
# transform
|
| 150 |
copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
|
| 151 |
-
#
|
| 152 |
-
# copy_df = copy_df[copy_df["E2E Latency (s) β¬οΈ"] <= 100]
|
| 153 |
-
|
| 154 |
fig = px.scatter(
|
| 155 |
copy_df,
|
| 156 |
y="Avg Score (%) β¬οΈ",
|
|
|
|
| 24 |
# model
|
| 25 |
"Model": "Model 🤗",
|
| 26 |
"Arch": "Arch ποΈ",
|
| 27 |
+
"Size": "Size π",
|
| 28 |
# deployment settings
|
| 29 |
"backend.name": "Backend π",
|
| 30 |
"backend.torch_dtype": "Dtype 📥",
|
| 31 |
"optimizations": "Optimizations 🛠️",
|
| 32 |
"quantization": "Quantization ποΈ",
|
| 33 |
+
# quality measurements
|
| 34 |
+
"Score": "Avg Score (%) β¬οΈ",
|
| 35 |
# throughput measurements
|
| 36 |
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
|
| 37 |
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
|
|
|
|
| 44 |
"generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
|
| 45 |
# energy measurements
|
| 46 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
|
|
|
|
|
|
|
| 47 |
}
|
| 48 |
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
|
| 49 |
SORTING_ASCENDING = [False, True]
|
|
|
|
| 148 |
copy_df = bench_df.copy()
|
| 149 |
# transform
|
| 150 |
copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
|
| 151 |
+
# plot
|
|
|
|
|
|
|
| 152 |
fig = px.scatter(
|
| 153 |
copy_df,
|
| 154 |
y="Avg Score (%) β¬οΈ",
|
src/assets/text_content.py
CHANGED
|
@@ -12,7 +12,7 @@ ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
|
|
| 12 |
<ul>
|
| 13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
| 14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
|
| 15 |
-
<li>LLMs are running on a singleton batch with a prompt size of
|
| 16 |
<li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
|
| 17 |
<li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
|
| 18 |
<li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
|
|
@@ -44,19 +44,21 @@ device: cuda
|
|
| 44 |
|
| 45 |
backend:
|
| 46 |
no_weights: true
|
| 47 |
-
delete_cache: true
|
| 48 |
torch_dtype: float16
|
| 49 |
-
quantization_strategy: gptq
|
| 50 |
bettertransformer: true
|
|
|
|
|
|
|
| 51 |
|
| 52 |
benchmark:
|
| 53 |
memory: true
|
| 54 |
-
|
|
|
|
|
|
|
| 55 |
input_shapes:
|
| 56 |
batch_size: 1
|
| 57 |
-
sequence_length:
|
|
|
|
| 58 |
|
| 59 |
-
new_tokens: 1000
|
| 60 |
```
|
| 61 |
"""
|
| 62 |
|
|
|
|
| 12 |
<ul>
|
| 13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
| 14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
|
| 15 |
+
<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 1000 tokens.</li>
|
| 16 |
<li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
|
| 17 |
<li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
|
| 18 |
<li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
|
|
|
|
| 44 |
|
| 45 |
backend:
|
| 46 |
no_weights: true
|
|
|
|
| 47 |
torch_dtype: float16
|
|
|
|
| 48 |
bettertransformer: true
|
| 49 |
+
quantization_scheme: gptq
|
| 50 |
+
|
| 51 |
|
| 52 |
benchmark:
|
| 53 |
memory: true
|
| 54 |
+
energy: true
|
| 55 |
+
|
| 56 |
+
new_tokens: 1000
|
| 57 |
input_shapes:
|
| 58 |
batch_size: 1
|
| 59 |
+
sequence_length: 256
|
| 60 |
+
|
| 61 |
|
|
|
|
| 62 |
```
|
| 63 |
"""
|
| 64 |
|