Commit
·
91c5b22
1
Parent(s):
93916f2
add tag
Browse files- src/md.py +3 -1
- src/utils.py +20 -7
src/md.py
CHANGED
|
@@ -2,6 +2,8 @@ ABOUT_TEXT = """
|
|
| 2 |
We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
|
| 3 |
A win is when the score for the chosen response is higher than the score for the rejected response.
|
| 4 |
|
|
|
|
|
|
|
| 5 |
## Overview
|
| 6 |
|
| 7 |
We average over 4 core sections (per prompt weighting):
|
|
@@ -93,5 +95,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa
|
|
| 93 |
TOP_TEXT = """
|
| 94 |
# RewardBench: Evaluating Reward Models
|
| 95 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
| 96 |
-
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
|
| 97 |
"""
|
|
|
|
| 2 |
We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
|
| 3 |
A win is when the score for the chosen response is higher than the score for the rejected response.
|
| 4 |
|
| 5 |
+
Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
|
| 6 |
+
|
| 7 |
## Overview
|
| 8 |
|
| 9 |
We average over 4 core sections (per prompt weighting):
|
|
|
|
| 95 |
TOP_TEXT = """
|
| 96 |
# RewardBench: Evaluating Reward Models
|
| 97 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
| 98 |
+
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {} | * Unverified models
|
| 99 |
"""
|
src/utils.py
CHANGED
|
@@ -5,24 +5,37 @@ import numpy as np
|
|
| 5 |
import os
|
| 6 |
import re
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# From Open LLM Leaderboard
|
| 9 |
def model_hyperlink(link, model_name):
|
| 10 |
# if model_name is above 50 characters, return first 47 characters and "..."
|
| 11 |
if len(model_name) > 50:
|
| 12 |
model_name = model_name[:47] + "..."
|
| 13 |
if model_name == "random":
|
| 14 |
-
|
| 15 |
elif model_name == "Cohere March 2024":
|
| 16 |
-
|
| 17 |
elif "openai" == model_name.split("/")[0]:
|
| 18 |
-
|
| 19 |
elif "Anthropic" == model_name.split("/")[0]:
|
| 20 |
-
|
| 21 |
elif "google" == model_name.split("/")[0]:
|
| 22 |
-
|
| 23 |
elif "PoLL" == model_name.split("/")[0]:
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def undo_hyperlink(html_string):
|
| 28 |
# Regex pattern to match content inside > and <
|
|
|
|
| 5 |
import os
|
| 6 |
import re
|
| 7 |
|
| 8 |
+
# Models whose scores were independently submitted and have not been verified
# by the RewardBench team. They are shown with a trailing " *" after the name.
UNVERIFIED_MODELS = [
    "nvidia/Nemotron-4-340B-Reward",
    "nvidia/Llama3-70B-SteerLM-RM",
    "Cohere May 2024",
    "google/gemini-1.5-pro-0514",
    "Cohere March 2024",
]


def _styled_link(href, text):
    """Return the dotted-underline HTML anchor used for model names."""
    return (
        f'<a target="_blank" href="{href}" style="color: var(--link-text-color); '
        f'text-decoration: underline;text-decoration-style: dotted;">{text}</a>'
    )


# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    """Render a model name as the HTML/text shown in the leaderboard table.

    Args:
        link: URL used for the anchor when no special case applies.
        model_name: Display name of the model, e.g. ``"org/model"``.

    Returns:
        A string holding either plain text (``"random"`` and ``PoLL/...``
        entries) or an ``<a>`` tag. Names listed in ``UNVERIFIED_MODELS``
        get ``" *"`` appended after the rendered name.
    """
    # if model_name is above 50 characters, return first 47 characters and "..."
    if len(model_name) > 50:
        model_name = model_name[:47] + "..."

    org = model_name.split("/")[0]
    if model_name == "random":
        output = "random"
    elif model_name == "Cohere March 2024":
        output = _styled_link("https://huggingface.co/Cohere", model_name)
    elif org == "openai":
        output = _styled_link("https://huggingface.co/openai", model_name)
    elif org == "Anthropic":
        output = _styled_link("https://huggingface.co/Anthropic", model_name)
    elif org == "google":
        output = _styled_link("https://huggingface.co/google", model_name)
    elif org == "PoLL":
        output = model_name
    else:
        # Generic case only. Without this `else`, the assignment would
        # overwrite every special case chosen above.
        output = _styled_link(link, model_name)

    if model_name in UNVERIFIED_MODELS:
        return output + " *"
    return output
|
| 39 |
|
| 40 |
def undo_hyperlink(html_string):
|
| 41 |
# Regex pattern to match content inside > and <
|