Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Commit 
							
							·
						
						874c0c9
	
1
								Parent(s):
							
							18596de
								
up
Browse files
    	
        app.py
    CHANGED
    
    | @@ -211,21 +211,24 @@ def regex_table(dataframe, regex, filter_button): | |
| 211 |  | 
| 212 | 
             
                # if Score exists, round to 2 decimals
         | 
| 213 | 
             
                if "Score" in data.columns:
         | 
| 214 | 
            -
                    data["Score"] = data["Score"]. | 
| 215 | 
             
                if "Average" in data.columns:
         | 
| 216 | 
            -
                    data["Average"] = data["Average"]. | 
| 217 | 
             
                # round all others to 1 decimal
         | 
| 218 | 
             
                for col in data.columns:
         | 
| 219 | 
             
                    if col not in ["", "Model", "Model Type", "Score", "Average"]:
         | 
| 220 | 
            -
                        data[col] = data[col]. | 
| 221 | 
             
                return data
         | 
| 222 |  | 
|  | |
|  | |
|  | |
| 223 |  | 
| 224 | 
             
            with gr.Blocks(css=custom_css) as app:
         | 
| 225 | 
             
                # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
         | 
| 226 | 
             
                with gr.Row():
         | 
| 227 | 
             
                    with gr.Column(scale=6):
         | 
| 228 | 
            -
                        gr.Markdown(TOP_TEXT)
         | 
| 229 | 
             
                    with gr.Column(scale=4):
         | 
| 230 | 
             
                        # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
         | 
| 231 | 
             
                        # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
         | 
|  | |
| 211 |  | 
| 212 | 
             
                # if Score exists, round to 2 decimals
         | 
| 213 | 
             
                if "Score" in data.columns:
         | 
| 214 | 
            +
                    data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
         | 
| 215 | 
             
                if "Average" in data.columns:
         | 
| 216 | 
            +
                    data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
         | 
| 217 | 
             
                # round all others to 1 decimal
         | 
| 218 | 
             
                for col in data.columns:
         | 
| 219 | 
             
                    if col not in ["", "Model", "Model Type", "Score", "Average"]:
         | 
| 220 | 
            +
                        data[col] = np.round(np.array(data[col].values).astype(float), 1)
         | 
| 221 | 
             
                return data
         | 
| 222 |  | 
| 223 | 
            +
            # import ipdb; ipdb.set_trace()
         | 
| 224 | 
            +
             | 
| 225 | 
            +
            total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values)
         | 
| 226 |  | 
| 227 | 
             
            with gr.Blocks(css=custom_css) as app:
         | 
| 228 | 
             
                # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
         | 
| 229 | 
             
                with gr.Row():
         | 
| 230 | 
             
                    with gr.Column(scale=6):
         | 
| 231 | 
            +
                        gr.Markdown(TOP_TEXT.format(str(total_models)))
         | 
| 232 | 
             
                    with gr.Column(scale=4):
         | 
| 233 | 
             
                        # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
         | 
| 234 | 
             
                        # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
         | 
    	
        src/md.py
    CHANGED
    
    | @@ -97,5 +97,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa | |
| 97 | 
             
            TOP_TEXT = """
         | 
| 98 | 
             
            # RewardBench: Evaluating Reward Models
         | 
| 99 | 
             
            ### Evaluating the capabilities, safety, and pitfalls of reward models
         | 
| 100 | 
            -
            [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787)
         | 
| 101 | 
             
            """
         | 
|  | |
| 97 | 
             
            TOP_TEXT = """
         | 
| 98 | 
             
            # RewardBench: Evaluating Reward Models
         | 
| 99 | 
             
            ### Evaluating the capabilities, safety, and pitfalls of reward models
         | 
| 100 | 
            +
            [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
         | 
| 101 | 
             
            """
         | 

