import os

import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub.file_download import hf_hub_download

from src.utils import process_model_name, process_model_arch
from src.assets.css_html_js import custom_css
from src.assets.text_content import (
    TITLE,
    ABOUT_TEXT,
    INTRODUCTION_TEXT,
    EXAMPLE_CONFIG_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
)
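
# leaderboard configuration: dataset repo, machine -> hardware display labels,
# mapping from raw benchmark/report columns to display names, and the sort order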
HF_TOKEN = os.environ.get("HF_TOKEN", None)
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
ALL_COLUMNS_MAPPING = {
    # model
    "Model": "Model 🤗",
    "Arch": "Arch 🏛️",
    "Size": "Size 📏",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "Dtype 📥",
    "optimizations": "Optimizations 🛠️",
    "quantization": "Quantization 🗜️",
    # quality measurements
    "Score": "Avg Score (%) ⬆️",
    # throughput measurements
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
    # latency measurements
    "forward.latency(s)": "Prefill Latency (s) ⬇️",
    "generate.latency(s)": "E2E Latency (s) ⬇️",
    # memory measurements
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
    "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
    # energy measurements
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬆️",
}
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
SORTING_ASCENDING = [False, True]
# one datatype per column in ALL_COLUMNS_MAPPING (16 entries)
ALL_COLUMNS_DATATYPES = [
    # model
    "markdown",
    "markdown",
    "number",
    # deployment settings
    "str",
    "str",
    "str",
    "str",
    # measurements
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
]


def get_benchmark_df(machine="hf-dgx-01"):
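    """Download the Open LLM scores and the machine's full benchmark report from the
    Hub, merge them on model name, derive extra columns (energy in tokens/kWh,
    optimizations, quantization scheme, decode throughput), then sort the rows,
    keep only the mapped columns and rename them for display."""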
    # download data
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/full-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    open_llm = pd.read_csv("dataset/open-llm.csv")
    full_report = pd.read_csv(f"dataset/{machine}/full-report.csv")
    # merge on model
    merged_df = open_llm.merge(full_report, left_on="Model", right_on="model")
    # invert energy consumption (kWh/token -> tokens/kWh)
    merged_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # fix nan values
    merged_df.loc[
        merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA
    # add optimizations column
    merged_df["optimizations"] = merged_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(
        lambda x: "BetterTransformer"
        if x["backend.to_bettertransformer"]
        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
        axis=1,
    )
    # add quantization scheme
    merged_df["quantization"] = merged_df["backend.quantization_scheme"].apply(
        lambda x: "BnB.4bit" if x == "bnb" else ("GPTQ.4bit" if x == "gptq" else "None")
    )
    # add decode throughput
    merged_df["decode.throughput(tokens/s)"] = (
        1000 / (merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"])
    ).round(2)
    # sort by metric
    merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
    # filter columns
    merged_df = merged_df[list(ALL_COLUMNS_MAPPING.keys())]
    # rename columns
    merged_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
    return merged_df


def get_benchmark_table(bench_df):
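    """Prepare the dataframe for the leaderboard table: pre-process the model name
    and architecture columns and mark the average score of 4-bit quantized models
    with a ** suffix."""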
    copy_df = bench_df.copy()
    # transform
    copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
    # process quantization
    copy_df["Avg Score (%) ⬆️"] = copy_df.apply(
        lambda x: f"{x['Avg Score (%) ⬆️']}**"
        if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
        else x["Avg Score (%) ⬆️"],
        axis=1,
    )
    return copy_df


def get_benchmark_chart(bench_df):
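    """Build the scatter plot of average Open LLM score vs. end-to-end latency, with
    marker size proportional to allocated memory and one color per architecture."""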
    copy_df = bench_df.copy()
    # transform
    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
    # plot
    fig = px.scatter(
        copy_df,
        y="Avg Score (%) ⬆️",
        x="E2E Latency (s) ⬇️",
        size="Allocated Memory (MB) ⬇️",
        color="Arch 🏛️",
        custom_data=list(ALL_COLUMNS_MAPPING.values()),
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_layout(
        title={
            "text": "Latency vs. Score vs. Memory",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Avg Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                f"<b>{column}:</b> %{{customdata[{i}]}}"
                for i, column in enumerate(ALL_COLUMNS_MAPPING.values())
            ]
        )
    )
    return fig


def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    quantization_scheme,
    score,
    memory,
    machine,
):
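    """Re-query the machine's benchmark dataframe and keep only the rows matching
    the control panel: model-name substring, selected backends, dtypes,
    optimizations and quantization schemes, minimum average score and maximum
    allocated (peak) memory. Returns the filtered table and the matching plot."""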
    raw_df = get_benchmark_df(machine=machine)
    filtered_df = raw_df[
        raw_df["Model 🤗"].str.contains(text, case=False)
        & raw_df["Backend 🏭"].isin(backends)
        & raw_df["Dtype 📥"].isin(datatypes)
        & (
            pd.concat(
                [
                    raw_df["Optimizations 🛠️"].str.contains(optimization, case=False)
                    for optimization in optimizations
                ],
                axis=1,
            ).any(axis="columns")
            if len(optimizations) > 0
            else True
        )
        & (
            pd.concat(
                [
                    raw_df["Quantization 🗜️"].str.contains(quantization, case=False)
                    for quantization in quantization_scheme
                ],
                axis=1,
            ).any(axis="columns")
            if len(quantization_scheme) > 0
            else True
        )
        & (raw_df["Avg Score (%) ⬆️"] >= score)
        & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
    ]
    filtered_table = get_benchmark_table(filtered_df)
    filtered_chart = get_benchmark_chart(filtered_df)
    return filtered_table, filtered_chart


# Demo interface
demo = gr.Blocks(css=custom_css)
with demo:
    # leaderboard title
    gr.HTML(TITLE)
    # introduction text
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")

    with gr.Tabs(elem_classes="leaderboard-tabs"):
        machine_placeholders = {}
        machine_tables = {}
        machine_plots = {}

        ####################### HARDWARE TABS #######################
        for i, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
            # dummy placeholder of the machine name
            machine_placeholders[machine] = gr.Textbox(value=machine, visible=False)

            with gr.TabItem(hardware, id=i):
                with gr.Tabs(elem_classes="machine-tabs"):
                    # placeholder for full dataframe
                    machine_df = get_benchmark_df(machine=machine)
                    with gr.TabItem("Leaderboard 🏅", id=0):
                        gr.HTML(
                            "👉 Scroll to the right 👉 for additional columns.",
                            elem_id="descriptive-text",
                        )
                        # Original leaderboard table
                        machine_tables[machine] = gr.components.Dataframe(
                            value=get_benchmark_table(machine_df),
                            headers=list(ALL_COLUMNS_MAPPING.values()),
                            datatype=ALL_COLUMNS_DATATYPES,
                            elem_id="machine-table",
                        )
                    with gr.TabItem("Plot 📊", id=1):
                        gr.HTML(
                            "👆 Hover over the points 👆 for additional information.",
                            elem_id="descriptive-text",
                        )
                        # Original leaderboard plot
                        machine_plots[machine] = gr.components.Plot(
                            value=get_benchmark_chart(machine_df),
                            elem_id="machine-plot",
                            show_label=False,
                        )

        ###################### CONTROL PANEL #######################
        with gr.TabItem("Control Panel 🎛️", id=2):
            gr.HTML(
                "Use this control panel to filter the leaderboard's table and plot.",  # noqa: E501
                elem_id="descriptive-text",
            )
            with gr.Row():
                with gr.Column():
                    search_bar = gr.Textbox(
                        label="Model 🤗",
                        info="🔍 Search for a model name",
                        elem_id="search-bar",
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    with gr.Box():
                        score_slider = gr.Slider(
                            label="Open LLM Score 📈",
                            info="🎚️ Slide to minimum Open LLM score",
                            value=0,
                            elem_id="threshold-slider",
                        )
                with gr.Column(scale=1):
                    with gr.Box():
                        memory_slider = gr.Slider(
                            label="Peak Memory (MB) 📈",
                            info="🎚️ Slide to maximum Peak Memory",
                            minimum=0,
                            maximum=80 * 1024,
                            value=80 * 1024,
                            elem_id="memory-slider",
                        )
                with gr.Column(scale=1):
                    backend_checkboxes = gr.CheckboxGroup(
                        label="Backends 🏭",
                        choices=["pytorch", "onnxruntime"],
                        value=["pytorch", "onnxruntime"],
                        info="☑️ Select the backends",
                        elem_id="backend-checkboxes",
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    datatype_checkboxes = gr.CheckboxGroup(
                        label="Load Dtypes 📥",
                        choices=["float32", "float16"],
                        value=["float32", "float16"],
                        info="☑️ Select the load dtypes",
                        elem_id="dtype-checkboxes",
                    )
                with gr.Column(scale=1):
                    optimizations_checkboxes = gr.CheckboxGroup(
                        label="Optimizations 🛠️",
                        choices=["None", "BetterTransformer"],
                        value=["None", "BetterTransformer"],
                        info="☑️ Select the optimizations",
                        elem_id="optimizations-checkboxes",
                    )
                with gr.Column(scale=1):
                    quantization_checkboxes = gr.CheckboxGroup(
                        label="Quantizations 🗜️",
                        choices=["None", "BnB.4bit", "GPTQ.4bit"],
                        value=["None", "BnB.4bit", "GPTQ.4bit"],
                        info="☑️ Select the quantization schemes",
                        elem_id="quantization-checkboxes",
                    )
            with gr.Row():
                filter_button = gr.Button(
                    value="Filter 🚀",
                    elem_id="filter-button",
                )
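            # wire the filter button to every machine's table and plot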
            for machine in MACHINE_TO_HARDWARE:
                filter_button.click(
                    filter_query,
                    [
                        search_bar,
                        backend_checkboxes,
                        datatype_checkboxes,
                        optimizations_checkboxes,
                        quantization_checkboxes,
                        score_slider,
                        memory_slider,
                        machine_placeholders[machine],
                    ],
                    [machine_tables[machine], machine_plots[machine]],
                )

        ####################### ABOUT TAB #######################
        with gr.TabItem("About 📖", id=3):
            gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
            gr.Markdown(EXAMPLE_CONFIG_TEXT, elem_classes="descriptive-text")

    ####################### CITATION #######################
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Launch demo
demo.queue().launch()