Spaces:
Running
Running
Commit
·
1d33a30
1
Parent(s):
9af70d6
add generative default off
Browse files- app.py +7 -3
- src/md.py +1 -0
- src/utils.py +5 -1
app.py
CHANGED
|
@@ -203,6 +203,8 @@ def regex_table(dataframe, regex, filter_button):
|
|
| 203 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
|
| 204 |
if "Custom Classifiers" not in filter_button:
|
| 205 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
|
|
|
|
|
|
|
| 206 |
# Filter the dataframe such that 'model' contains any of the regex patterns
|
| 207 |
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
|
| 208 |
|
|
@@ -217,6 +219,8 @@ def regex_table(dataframe, regex, filter_button):
|
|
| 217 |
# round all others to 1 decimal
|
| 218 |
for col in data.columns:
|
| 219 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
|
|
|
|
|
|
| 220 |
data[col] = np.round(np.array(data[col].values).astype(float), 1)
|
| 221 |
return data
|
| 222 |
|
|
@@ -242,7 +246,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
| 242 |
search_1 = gr.Textbox(label="Model Search (delimit with , )",
|
| 243 |
placeholder="Model Search (delimit with , )",
|
| 244 |
show_label=False)
|
| 245 |
-
model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
| 246 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
| 247 |
label="Model Types",
|
| 248 |
show_label=False,
|
|
@@ -267,7 +271,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
| 267 |
with gr.TabItem("🔍 RewardBench - Detailed"):
|
| 268 |
with gr.Row():
|
| 269 |
search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
| 270 |
-
model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
| 271 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
| 272 |
label="Model Types",
|
| 273 |
show_label=False,
|
|
@@ -307,7 +311,7 @@ with gr.Blocks(css=custom_css) as app:
|
|
| 307 |
with gr.TabItem("Prior Test Sets"):
|
| 308 |
with gr.Row():
|
| 309 |
search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
| 310 |
-
model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
|
| 311 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
| 312 |
label="Model Types",
|
| 313 |
show_label=False,
|
|
|
|
| 203 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
|
| 204 |
if "Custom Classifiers" not in filter_button:
|
| 205 |
dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
|
| 206 |
+
if "Generative" not in filter_button:
|
| 207 |
+
dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
|
| 208 |
# Filter the dataframe such that 'model' contains any of the regex patterns
|
| 209 |
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
|
| 210 |
|
|
|
|
| 219 |
# round all others to 1 decimal
|
| 220 |
for col in data.columns:
|
| 221 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
| 222 |
+
# replace any data[col].values == '' with np.NaN
|
| 223 |
+
data[col] = data[col].replace('', np.NaN)
|
| 224 |
data[col] = np.round(np.array(data[col].values).astype(float), 1)
|
| 225 |
return data
|
| 226 |
|
|
|
|
| 246 |
search_1 = gr.Textbox(label="Model Search (delimit with , )",
|
| 247 |
placeholder="Model Search (delimit with , )",
|
| 248 |
show_label=False)
|
| 249 |
+
model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
|
| 250 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
| 251 |
label="Model Types",
|
| 252 |
show_label=False,
|
|
|
|
| 271 |
with gr.TabItem("🔍 RewardBench - Detailed"):
|
| 272 |
with gr.Row():
|
| 273 |
search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
| 274 |
+
model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
|
| 275 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
| 276 |
label="Model Types",
|
| 277 |
show_label=False,
|
|
|
|
| 311 |
with gr.TabItem("Prior Test Sets"):
|
| 312 |
with gr.Row():
|
| 313 |
search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
| 314 |
+
model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
|
| 315 |
value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
|
| 316 |
label="Model Types",
|
| 317 |
show_label=False,
|
src/md.py
CHANGED
|
@@ -22,6 +22,7 @@ We include multiple types of reward models in this evaluation:
|
|
| 22 |
2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
|
| 23 |
3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
|
| 24 |
4. **Random**: Random choice baseline.
|
|
|
|
| 25 |
|
| 26 |
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
| 27 |
Others, such as **Generative Judge** are coming soon.
|
|
|
|
| 22 |
2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
|
| 23 |
3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
|
| 24 |
4. **Random**: Random choice baseline.
|
| 25 |
+
5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
|
| 26 |
|
| 27 |
All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
|
| 28 |
Others, such as **Generative Judge** are coming soon.
|
src/utils.py
CHANGED
|
@@ -9,8 +9,12 @@ import re
|
|
| 9 |
def model_hyperlink(link, model_name):
|
| 10 |
if model_name == "random":
|
| 11 |
return "random"
|
| 12 |
-
|
| 13 |
return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 15 |
|
| 16 |
def undo_hyperlink(html_string):
|
|
|
|
| 9 |
def model_hyperlink(link, model_name):
|
| 10 |
if model_name == "random":
|
| 11 |
return "random"
|
| 12 |
+
elif model_name == "Cohere March 2024":
|
| 13 |
return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 14 |
+
elif "openai" == model_name.split("/")[0]:
|
| 15 |
+
return f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 16 |
+
elif "Anthropic" == model_name.split("/")[0]:
|
| 17 |
+
return f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 18 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
| 19 |
|
| 20 |
def undo_hyperlink(html_string):
|