Commit 56fcfaf
Parent(s): 90eea3b

length experiment
Files changed:
- app.py +72 -0
- src/utils.py +5 -0
app.py
CHANGED

@@ -63,14 +63,78 @@ def avg_over_herm(dataframe):
 def expand_subsets(dataframe):
     # TODO need to modify data/ script to do this
     pass
+
+# reference for length bias categories
+length_categories = {
+    'alpacaeval-easy': 'True',
+    'alpacaeval-hard': 'True',
+    'alpacaeval-length': 'Neutral',
+    'donotanswer': 'False',
+    'hep-cpp': 'Neutral',
+    'hep-go': 'Neutral',
+    'hep-java': 'Neutral',
+    'hep-js': 'Neutral',
+    'hep-python': 'Neutral',
+    'hep-rust': 'Neutral',
+    'llmbar-adver-GPTInst': 'False',
+    'llmbar-adver-GPTOut': 'Neutral',
+    'llmbar-adver-manual': 'False',
+    'llmbar-adver-neighbor': 'False',
+    'llmbar-natural': 'Neutral',
+    'mt-bench-easy': 'False',
+    'mt-bench-hard': 'False',
+    'mt-bench-med': 'Neutral',
+    'refusals-dangerous': 'False',
+    'refusals-offensive': 'False',
+    'xstest-should-refuse': 'False',
+    'xstest-should-respond': 'True'
+}
+
+def length_bias_check(dataframe):
+    """
+    Takes the raw herm dataframe and splits the data into new buckets according to length_categories.
+    Then, takes the average of the three buckets as "average".
+    """
+    new_df = dataframe.copy()
+    existing_subsets = new_df.columns[2:]
+    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
+    # new_data is an empty-list dict, one entry per final subset
+    new_data = {s: [] for s in final_subsets}
+
+    # now, subsets correspond to those with True, Neutral, and False length bias
+    # check if length_categories[subset] == "True" or "False" or "Neutral"
+    for subset in existing_subsets:
+        subset_data = new_df[subset].values
+        subset_length = length_categories[subset]
+        # route to the correct bucket
+        if subset_length == "True":
+            new_data["Length Bias"].append(subset_data)
+        elif subset_length == "Neutral":
+            new_data["Neutral"].append(subset_data)
+        elif subset_length == "False":
+            new_data["Terse Bias"].append(subset_data)
+
+    # take the average of new_data and add to new_df (removing columns other than model)
+    for subset in final_subsets:
+        new_df[subset] = np.round(np.nanmean(new_data[subset], axis=0), 2)
+    keep_columns = ["model"] + final_subsets
+    new_df = new_df[keep_columns]
+    # recompute average
+    # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
+
+    return new_df
+
+
 
 herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
 herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
+herm_data_length = length_bias_check(herm_data).sort_values(by='Terse Bias', ascending=False)
 prefs_data = load_all_data(repo_dir_prefs).sort_values(by='average', ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
 
 col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
 col_types_herm_avg = ["markdown"] + ["number"] * (len(herm_data_avg.columns) - 1)
+cols_herm_data_length = ["markdown"] + ["number"] * (len(herm_data_length.columns) - 1)
 col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
 # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
 
@@ -114,6 +178,14 @@ with gr.Blocks() as app:
                     headers=herm_data.columns.tolist(),
                     elem_id="herm_dataframe",
                 )
+        with gr.TabItem("HERM - Length Bias"):
+            with gr.Row():
+                herm_table = gr.Dataframe(
+                    herm_data_length.values,
+                    datatype=cols_herm_data_length,
+                    headers=herm_data_length.columns.tolist(),
+                    elem_id="herm_dataframe_length",
+                )
         with gr.TabItem("Pref Sets - Overview"):
                 pref_sets_table = gr.Dataframe(
                     prefs_data.values,
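For orientation, here is a minimal, self-contained sketch of the bucketing that the new length_bias_check performs. The dataframe, subset names, and scores below are hypothetical stand-ins for the data returned by load_all_data; the category mapping mirrors a small subset of length_categories above.

```python
import numpy as np
import pandas as pd

# Hypothetical leaderboard slice: model, average, then per-subset scores.
# length_bias_check uses columns[2:], so model and average come first.
scores = pd.DataFrame({
    "model": ["model-a", "model-b"],
    "average": [0.73, 0.63],
    "alpacaeval-easy": [0.90, 0.80],  # 'True'    -> "Length Bias"
    "hep-python": [0.75, 0.65],       # 'Neutral' -> "Neutral"
    "mt-bench-hard": [0.55, 0.45],    # 'False'   -> "Terse Bias"
})

categories = {"alpacaeval-easy": "True", "hep-python": "Neutral", "mt-bench-hard": "False"}
buckets = {"True": "Length Bias", "Neutral": "Neutral", "False": "Terse Bias"}

# Group the subset columns by their length category, as the loop in length_bias_check does.
grouped = {name: [] for name in buckets.values()}
for subset, cat in categories.items():
    grouped[buckets[cat]].append(scores[subset].values)

# Average within each bucket and keep only the model column plus the three buckets.
out = scores[["model"]].copy()
for bucket, cols in grouped.items():
    out[bucket] = np.round(np.nanmean(cols, axis=0), 2)

print(out)
# -> columns: model, Length Bias, Neutral, Terse Bias (one averaged score per bucket)
```

The resulting frame is what the new "HERM - Length Bias" tab renders, sorted by the "Terse Bias" column.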
    	
src/utils.py
CHANGED

@@ -62,4 +62,9 @@ def load_all_data(data_repo, subsubsets=False):    # use HF api to pull the git
     cols = list(df.columns)
     cols.insert(1, cols.pop(cols.index('average')))
     df = df.loc[:, cols]
+
+    # remove columns xstest (outdated data)
+    # if xstest is a column
+    if "xstest" in df.columns:
+        df = df.drop(columns=["xstest"])
     return df
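As a quick sanity check, a minimal sketch of the new guard in load_all_data; the frame here is made up and only exercises the xstest drop.

```python
import pandas as pd

# Made-up frame standing in for the loaded results.
df = pd.DataFrame({"model": ["some-model"], "average": [0.5], "xstest": [0.4]})

# Same guard as the diff above: drop the outdated xstest column if it exists.
if "xstest" in df.columns:
    df = df.drop(columns=["xstest"])

print(df.columns.tolist())  # ['model', 'average']
```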

