Spaces:

allenai
/

reward-bench

Running

natolambert committed on Mar 26, 2024

Commit

874c0c9

1 Parent(s): 18596de

up

Files changed (2) hide show

app.py CHANGED Viewed

@@ -211,21 +211,24 @@ def regex_table(dataframe, regex, filter_button):
     # if Score exists, round to 2 decimals
     if "Score" in data.columns:
-        data["Score"] = data["Score"].round(2)
     if "Average" in data.columns:
-        data["Average"] = data["Average"].round(1)
     # round all others to 1 decimal
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
-            data[col] = data[col].round(1)
     return data
 with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
     with gr.Row():
         with gr.Column(scale=6):
-            gr.Markdown(TOP_TEXT)
         with gr.Column(scale=4):
             # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
             # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)

     # if Score exists, round to 2 decimals
     if "Score" in data.columns:
+        data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
     if "Average" in data.columns:
+        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
     # round all others to 1 decimal
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
+            data[col] = np.round(np.array(data[col].values).astype(float), 1)
     return data
+# import ipdb; ipdb.set_trace()
+total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values)
 with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
     with gr.Row():
         with gr.Column(scale=6):
+            gr.Markdown(TOP_TEXT.format(str(total_models)))
         with gr.Column(scale=4):
             # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
             # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)

src/md.py CHANGED Viewed

@@ -97,5 +97,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa
 TOP_TEXT = """
 # RewardBench: Evaluating Reward Models
 ### Evaluating the capabilities, safety, and pitfalls of reward models
-[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787)
 """

 TOP_TEXT = """
 # RewardBench: Evaluating Reward Models
 ### Evaluating the capabilities, safety, and pitfalls of reward models
+[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
 """