Commit 9ceb843
Parent(s): b514443

update

Files changed:
- .gitignore   +2  -0
- app.py       +89 -105
- src/md.py    +28 -0
- src/utils.py +60 -0
.gitignore CHANGED

@@ -1 +1,3 @@
 evals/
+__pycache__/*
+*.pyc
    	
app.py CHANGED

@@ -1,131 +1,115 @@
 import gradio as gr
-import pandas as pd
-from pathlib import Path
-from datasets import load_dataset
 import os
-from huggingface_hub import HfApi, 
+from huggingface_hub import HfApi, snapshot_download
+from src.utils import load_all_data
+from src.md import ABOUT_TEXT
 import numpy as np
 
 api = HfApi()
 
 COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
 evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
-
+prefs_repo = "ai2-rlhf-collab/rm-testset-results"
+repo_dir_herm = "./evals/herm/"
+repo_dir_prefs = "./evals/prefs/"
+
 # def restart_space():
 #     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
 
 
-# From Open LLM Leaderboard
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 print("Pulling evaluation results")
-repo = 
-    local_dir=
-
-
+repo = snapshot_download(
+    local_dir=repo_dir_herm,
+    repo_id=evals_repo,
+    tqdm_class=None,
+    etag_timeout=30,
     repo_type="dataset",
 )
-repo.git_pull()
-
-# Define a function to fetch and process data
-def fetch_and_display_data():    # use HF api to pull the git repo
-    dir = Path(BASE_DIR)
-    data_dir = dir / "data"
-    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
-    # get all files within the sub folders orgs
-    models_results = []
-    for org in orgs:
-        org_dir = data_dir / org
-        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
-        for file in files:
-            if file.endswith(".json"):
-                models_results.append(org + "/" + file)
-
-    # create empty dataframe to add all data to
-    df = pd.DataFrame()
-
-    # load all json data in the list models_results one by one to avoid not having the same entries
-    for model in models_results:
-        model_data = load_dataset("json", data_files=BASE_DIR + "data/" + model, split="train")
-        df2 = pd.DataFrame(model_data)
-        # add to df
-        df = pd.concat([df2, df])
-
[old lines 55-76 deleted; apart from a truncated "# add" comment, their content is not preserved in the source view]
-    cols = list(df.columns)
-    cols.insert(1, cols.pop(cols.index('average')))
-    df = df.loc[:, cols]
-    return df
-
-benchmark_text = """
-# HERM Results Viewer
-
-We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
-A win is when the score for the chosen response is higher than the score for the rejected response.
-
-### Subset summary
 
[old lines 90-93 deleted; content is not preserved in the source view]
-| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
-| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
-| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
-| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
-| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
-| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
-| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
-| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
-| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
-| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
-| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
-| XSTest                 |                     450                     | TODO curate                                                       |
-| (?) repetitiveness     |                                               |                                                                   |
-| (?) grammar            |                                               |                                                                   |
 
-
-For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
-"""
-leaderboard_data = fetch_and_display_data()
-col_types = ["markdown"] + ["number"] * (len(leaderboard_data.columns) - 1)
+# repo.git_pull()
+
+repo_pref_sets = snapshot_download(
+    local_dir=repo_dir_prefs,
+    repo_id=prefs_repo,
+    use_auth_token=COLLAB_TOKEN,
+    tqdm_class=None,
+    etag_timeout=30,
+    repo_type="dataset",
+)
+# repo_pref_sets.git_pull()
+
+def avg_over_herm(dataframe):
+    """
+    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
+    """
+    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
+    # for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
+    for subset in subsets:
+        subset_cols = [col for col in dataframe.columns if subset in col]
+        dataframe[subset] = np.round(np.nanmean(dataframe[subset_cols].values, axis=1), 2)
+
+    keep_columns = ["model", "average"] + subsets
+    dataframe = dataframe[keep_columns]
+    # replace average column with new average
+    dataframe["average"] = np.round(np.nanmean(dataframe[subsets].values, axis=1), 2)
+    return dataframe
+
+def expand_subsets(dataframe):
+    # TODO need to modify data/ script to do this
+    pass
+
+herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
+herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
+prefs_data = load_all_data(repo_dir_prefs).sort_values(by='average', ascending=False)
+# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
+
+col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
+col_types_herm_avg = ["markdown"] + ["number"] * (len(herm_data_avg.columns) - 1)
+col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
+# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
+
 with gr.Blocks() as app:
+    # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
     with gr.Row():
-        gr.Markdown(
[old lines 117-125 deleted; content is not preserved in the source view]
+        gr.Markdown("# HERM Results Viewer")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("HERM - Overview"):
+            with gr.Row():
+                herm_table = gr.Dataframe(
+                    herm_data_avg.values,
+                    datatype=col_types_herm_avg,
+                    headers=herm_data_avg.columns.tolist(),
+                    elem_id="herm_dataframe_avg",
+                )
+        with gr.TabItem("HERM - Detailed"):
+            with gr.Row():
+                herm_table = gr.Dataframe(
+                    herm_data.values,
+                    datatype=col_types_herm,
+                    headers=herm_data.columns.tolist(),
+                    elem_id="herm_dataframe",
+                )
+        with gr.TabItem("Pref Sets - Overview"):
+                pref_sets_table = gr.Dataframe(
+                    prefs_data.values,
+                    datatype=col_types_prefs,
+                    headers=prefs_data.columns.tolist(),
+                    elem_id="prefs_dataframe",
+                )
+
+        with gr.TabItem("About"):
+            with gr.Row():
+                gr.Markdown(ABOUT_TEXT)
+
 # Load data when app starts
 def load_data_on_start():
[old lines 128-129 deleted; content is not preserved in the source view]
+    data_herm = load_all_data(repo_dir_herm)
+    herm_table.update(data_herm)
+
+    data_herm_avg = avg_over_herm(repo_dir_herm)
+    herm_table.update(data_herm_avg)
+
+    data_prefs = load_all_data(repo_dir_prefs)
+    pref_sets_table.update(data_prefs)
 
 app.launch()
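For reference, a minimal sketch (not part of the commit) of how the new avg_over_herm helper behaves, assuming the definition from app.py above is in scope. The column names are hypothetical, but shaped like the per-subset scores load_all_data returns:

import pandas as pd

toy = pd.DataFrame({
    "model": ["org/rm-a"],   # hypothetical model name
    "average": [0.0],        # placeholder; avg_over_herm recomputes it
    "alpacaeval-easy": [0.90],
    "alpacaeval-hard": [0.60],
    "mt-bench-easy": [0.80],
    "llmbar-natural": [0.70],
    "refusals-dangerous": [0.50],
    "hep-python": [0.40],
})
# Columns are grouped by substring match on each subset name, so the result keeps
# "model", one column per subset (alpacaeval = 0.75, mt-bench = 0.80, ...), and a
# recomputed "average" over those subset means.
print(avg_over_herm(toy))

Note that avg_over_herm expects a dataframe rather than a path, so the avg_over_herm(repo_dir_herm) call inside load_data_on_start would only work if it were passed the loaded dataframe instead.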
    	
src/md.py ADDED

@@ -0,0 +1,28 @@
+ABOUT_TEXT = """
+We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
+A win is when the score for the chosen response is higher than the score for the rejected response.
+
+### Subset summary
+
+| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description                                                       |
+| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
+| alpacaeval-easy        |                     805                     | Great model vs poor model                                         |
+| alpacaeval-length      |                     805                     | Good model vs low model, equal length                             |
+| alpacaeval-hard        |                     805                     | Great model vs baseline model                                     |
+| mt-bench-easy          |                  28, 28                    | MT Bench 10s vs 1s                                                |
+| mt-bench-medium        |                  45, 40                    | MT Bench 9s vs 2-5s                                               |
+| mt-bench-hard          |                  45, 37                    | MT Bench 7-8 vs 5-6                                               |
+| refusals-dangerous     |                     505                     | Dangerous response vs no response                                 |
+| refusals-offensive     |                     704                     | Offensive response vs no response                                 |
+| llmbar-natural         |                     100                     | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+| llmbar-adver-neighbor  |                     134                     | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+| llmbar-adver-GPTInst   |                     92                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
+| llmbar-adver-GPTOut    |                     47                      | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+| llmbar-adver-manual    |                     46                      | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+| XSTest                 |                     450                     | TODO curate                                                       |
+| (?) repetitiveness     |                                               |                                                                   |
+| (?) grammar            |                                               |                                                                   |
+
+
+For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
+"""
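The win definition in ABOUT_TEXT reduces to a simple comparison per prompt. A minimal sketch, with illustrative field names rather than the benchmark's actual schema:

def win_percentage(scores_chosen, scores_rejected):
    # one (chosen, rejected) reward-model score pair per prompt; a win is chosen > rejected
    wins = sum(c > r for c, r in zip(scores_chosen, scores_rejected))
    return 100 * wins / len(scores_chosen)

print(win_percentage([2.1, 0.4, 1.7], [1.3, 0.9, 0.2]))  # 2 of 3 wins -> ~66.7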
    	
src/utils.py ADDED

@@ -0,0 +1,60 @@
+import pandas as pd
+from pathlib import Path
+from datasets import load_dataset
+import numpy as np
+import os
+
+# From Open LLM Leaderboard
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+# Define a function to fetch and process data
+def load_all_data(data_repo, subsubsets=False):    # use HF api to pull the git repo
+    dir = Path(data_repo)
+    data_dir = dir / "data"
+    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
+    # get all files within the sub folders orgs
+    models_results = []
+    for org in orgs:
+        org_dir = data_dir / org
+        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
+        for file in files:
+            if file.endswith(".json"):
+                models_results.append(org + "/" + file)
+
+    # create empty dataframe to add all data to
+    df = pd.DataFrame()
+
+    # load all json data in the list models_results one by one to avoid not having the same entries
+    for model in models_results:
+        model_data = load_dataset("json", data_files=data_repo + "data/" + model, split="train")
+        df2 = pd.DataFrame(model_data)
+        # add to df
+        df = pd.concat([df2, df])
+
+
+    # remove chat_template column
+    df = df.drop(columns=["chat_template"])
+
+    # move column "model" to the front
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index('model')))
+    df = df.loc[:, cols]
+
+    # select all columns except "model"
+    cols = df.columns.tolist()
+    cols.remove("model")
+    # round
+    df[cols] = df[cols].round(2)
+    avg = np.nanmean(df[cols].values, axis=1).round(2)
+    # add average column
+    df["average"] = avg
+
+    # apply model_hyperlink function to column "model"
+    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
+
+    # move average column to the second
+    cols = list(df.columns)
+    cols.insert(1, cols.pop(cols.index('average')))
+    df = df.loc[:, cols]
+    return df
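A hypothetical usage sketch of the new loader, assuming the layout the snapshot_download calls in app.py produce (<local_dir>/data/<org>/<model>.json, each file holding a "model" field, a "chat_template" field, and one numeric score per subset):

from src.utils import load_all_data

# data_repo is concatenated as data_repo + "data/" + ..., so pass a trailing slash
df = load_all_data("./evals/herm/")
print(df[["model", "average"]].head())

The returned frame has the model column rendered as a Hugging Face hyperlink, scores rounded to two decimals, and "average" moved to the second column, which is what the Gradio tables in app.py display.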