llm-perf-leaderboard

Running

IlyasMoutawwakil HF Staff committed on Nov 16, 2023

Commit

f3dc796

1 Parent(s): 29e37fd

update

Files changed (2) hide show

app.py CHANGED Viewed

@@ -24,12 +24,14 @@ ALL_COLUMNS_MAPPING = {
     # model
     "Model": "Model 🤗",
     "Arch": "Arch 🏛️",
-    "Size": "Size 🏋️",
     # deployment settings
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     "quantization": "Quantization 🗜️",
     # throughput measurements
     "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
     "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
@@ -42,8 +44,6 @@ ALL_COLUMNS_MAPPING = {
     "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
     # energy measurements
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
-    # quality measurements
-    "Score": "Avg Score (%) ⬆️",
 }
 SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
 SORTING_ASCENDING = [False, True]
@@ -148,9 +148,7 @@ def get_benchmark_chart(bench_df):
     copy_df = bench_df.copy()
     # transform
     copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
-    # filter latency bigger than 100s
-    # copy_df = copy_df[copy_df["E2E Latency (s) ⬇️"] <= 100]
     fig = px.scatter(
         copy_df,
         y="Avg Score (%) ⬆️",

     # model
     "Model": "Model 🤗",
     "Arch": "Arch 🏛️",
+    "Size": "Size 📏",
     # deployment settings
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     "quantization": "Quantization 🗜️",
+    # quality measurements
+    "Score": "Avg Score (%) ⬆️",
     # throughput measurements
     "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
     "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
     "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
     # energy measurements
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
 }
 SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
 SORTING_ASCENDING = [False, True]
     copy_df = bench_df.copy()
     # transform
     copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
+    # plot
     fig = px.scatter(
         copy_df,
         y="Avg Score (%) ⬆️",

src/assets/text_content.py CHANGED Viewed

@@ -12,7 +12,7 @@ ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
-    <li>LLMs are running on a singleton batch with a prompt size of 512 and generating a 1000 tokens.</li>
     <li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
     <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
     <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
@@ -44,19 +44,21 @@ device: cuda
 backend:
   no_weights: true
-  delete_cache: true
   torch_dtype: float16
-  quantization_strategy: gptq
   bettertransformer: true
 benchmark:
   memory: true
   input_shapes:
     batch_size: 1
-    sequence_length: 512
-  new_tokens: 1000
 ```
 """

 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
+    <li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 1000 tokens.</li>
     <li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
     <li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
     <li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
 backend:
   no_weights: true
   torch_dtype: float16
   bettertransformer: true
+  quantization_scheme: gptq
 benchmark:
   memory: true
+  energy: true
+  new_tokens: 1000
   input_shapes:
     batch_size: 1
+    sequence_length: 256
 ```
 """