Spaces:

allenai
/

reward-bench

Running

App Files Files Community

natolambert committed on Apr 3, 2024

Commit

1d33a30

1 Parent(s): 9af70d6

add generative default off

Browse files

Files changed (3) hide show

app.py +7 -3
src/md.py +1 -0
src/utils.py +5 -1

app.py CHANGED Viewed

@@ -203,6 +203,8 @@ def regex_table(dataframe, regex, filter_button):
             dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
         if "Custom Classifiers" not in filter_button:
             dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
@@ -217,6 +219,8 @@ def regex_table(dataframe, regex, filter_button):
     # round all others to 1 decimal
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
             data[col] = np.round(np.array(data[col].values).astype(float), 1)
     return data
@@ -242,7 +246,7 @@ with gr.Blocks(css=custom_css) as app:
                 search_1 = gr.Textbox(label="Model Search (delimit with , )",
                                       placeholder="Model Search (delimit with , )",
                                       show_label=False)
-                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                                                  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                  label="Model Types",
                                                  show_label=False,
@@ -267,7 +271,7 @@ with gr.Blocks(css=custom_css) as app:
         with gr.TabItem("🔍 RewardBench - Detailed"):
             with gr.Row():
                 search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
-                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                                                  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                  label="Model Types",
                                                  show_label=False,
@@ -307,7 +311,7 @@ with gr.Blocks(css=custom_css) as app:
         with gr.TabItem("Prior Test Sets"):
             with gr.Row():
                 search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
-                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                                                  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                  label="Model Types",
                                                  show_label=False,

             dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
         if "Custom Classifiers" not in filter_button:
             dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
+        if "Generative" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
     # round all others to 1 decimal
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
+            # replace any data[col].values == '' with np.NaN
+            data[col] = data[col].replace('', np.NaN)
             data[col] = np.round(np.array(data[col].values).astype(float), 1)
     return data
                 search_1 = gr.Textbox(label="Model Search (delimit with , )",
                                       placeholder="Model Search (delimit with , )",
                                       show_label=False)
+                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
                                                  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                  label="Model Types",
                                                  show_label=False,
         with gr.TabItem("🔍 RewardBench - Detailed"):
             with gr.Row():
                 search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
+                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
                                                  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                  label="Model Types",
                                                  show_label=False,
         with gr.TabItem("Prior Test Sets"):
             with gr.Row():
                 search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
+                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
                                                  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                  label="Model Types",
                                                  show_label=False,

src/md.py CHANGED Viewed

@@ -22,6 +22,7 @@ We include multiple types of reward models in this evaluation:
 2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
 4. **Random**: Random choice baseline.
 All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 Others, such as **Generative Judge**, are coming soon.

 2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
 4. **Random**: Random choice baseline.
+5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
 All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 Others, such as **Generative Judge**, are coming soon.

src/utils.py CHANGED Viewed

@@ -9,8 +9,12 @@ import re
 def model_hyperlink(link, model_name):
     if model_name == "random":
         return "random"
-    if model_name == "Cohere March 2024":
         return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 def undo_hyperlink(html_string):

 def model_hyperlink(link, model_name):
     if model_name == "random":
         return "random"
+    elif model_name == "Cohere March 2024":
         return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    elif "openai" == model_name.split("/")[0]:
+        return f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    elif "Anthropic" == model_name.split("/")[0]:
+        return f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 def undo_hyperlink(html_string):