Spaces:
Running
Running
Commit
·
e5d5995
1
Parent(s):
8e499f4
smol improvements
Browse files
- app.py +38 -21
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
from huggingface_hub import HfApi, snapshot_download
|
|
|
|
| 4 |
from datasets import load_dataset
|
| 5 |
from src.utils import load_all_data
|
| 6 |
from src.md import ABOUT_TEXT
|
|
@@ -15,10 +16,8 @@ eval_set_repo = "ai2-rlhf-collab/rm-benchmark-dev"
|
|
| 15 |
repo_dir_herm = "./evals/herm/"
|
| 16 |
repo_dir_prefs = "./evals/prefs/"
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
|
| 23 |
print("Pulling evaluation results")
|
| 24 |
repo = snapshot_download(
|
|
@@ -43,17 +42,18 @@ def avg_over_herm(dataframe):
|
|
| 43 |
"""
|
| 44 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
| 45 |
"""
|
|
|
|
| 46 |
subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
|
| 47 |
# for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
|
| 48 |
for subset in subsets:
|
| 49 |
-
subset_cols = [col for col in
|
| 50 |
-
|
| 51 |
|
| 52 |
keep_columns = ["model", "average"] + subsets
|
| 53 |
-
|
| 54 |
# replace average column with new average
|
| 55 |
-
|
| 56 |
-
return
|
| 57 |
|
| 58 |
def expand_subsets(dataframe):
|
| 59 |
# TODO need to modify data/ script to do this
|
|
@@ -71,12 +71,23 @@ col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
|
|
| 71 |
|
| 72 |
# for showing random samples
|
| 73 |
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
|
| 74 |
-
def random_sample(r: gr.Request):
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
|
| 78 |
return markdown_text
|
| 79 |
|
|
|
|
|
|
|
| 80 |
with gr.Blocks() as app:
|
| 81 |
# create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
|
| 82 |
with gr.Row():
|
|
@@ -114,23 +125,29 @@ with gr.Blocks() as app:
|
|
| 114 |
with gr.Row():
|
| 115 |
# loads one sample
|
| 116 |
gr.Markdown("## Random Dataset Sample Viewer")
|
|
|
|
| 117 |
button = gr.Button("Show Random Sample")
|
| 118 |
|
| 119 |
with gr.Row():
|
| 120 |
sample_display = gr.Markdown("{sampled data loads here}")
|
| 121 |
|
| 122 |
-
button.click(fn=random_sample, outputs=sample_display)
|
| 123 |
|
| 124 |
|
| 125 |
# Load data when app starts, TODO make this used somewhere...
|
| 126 |
-
def load_data_on_start():
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
|
|
|
| 132 |
|
| 133 |
-
data_prefs = load_all_data(repo_dir_prefs)
|
| 134 |
-
pref_sets_table.update(data_prefs)
|
| 135 |
|
| 136 |
-
app.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
from huggingface_hub import HfApi, snapshot_download
|
| 4 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
from datasets import load_dataset
|
| 6 |
from src.utils import load_all_data
|
| 7 |
from src.md import ABOUT_TEXT
|
|
|
|
| 16 |
repo_dir_herm = "./evals/herm/"
|
| 17 |
repo_dir_prefs = "./evals/prefs/"
|
| 18 |
|
| 19 |
+
def restart_space():
    # Restart this Hugging Face Space via the Hub API so it re-pulls fresh
    # evaluation results; invoked on a schedule by the BackgroundScheduler
    # configured at the bottom of the file. `api` and COLLAB_TOKEN are
    # module-level globals defined earlier in app.py.
    api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)
|
|
|
|
|
|
|
| 21 |
|
| 22 |
print("Pulling evaluation results")
|
| 23 |
repo = snapshot_download(
|
|
|
|
| 42 |
"""
|
| 43 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
| 44 |
"""
|
| 45 |
+
new_df = dataframe.copy()
|
| 46 |
subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]
|
| 47 |
# for each subset, avg the columns that have the subset in the column name, then add a new column with subset name and avg
|
| 48 |
for subset in subsets:
|
| 49 |
+
subset_cols = [col for col in new_df.columns if subset in col]
|
| 50 |
+
new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
|
| 51 |
|
| 52 |
keep_columns = ["model", "average"] + subsets
|
| 53 |
+
new_df = new_df[keep_columns]
|
| 54 |
# replace average column with new average
|
| 55 |
+
new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
|
| 56 |
+
return new_df
|
| 57 |
|
| 58 |
def expand_subsets(dataframe):
|
| 59 |
# TODO need to modify data/ script to do this
|
|
|
|
| 71 |
|
| 72 |
# for showing random samples
|
| 73 |
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
|
| 74 |
+
def random_sample(r: gr.Request, subset):
    """Return one random sample from the eval set, rendered as markdown.

    Args:
        r: Gradio request object (unused; supplied by the click handler).
        subset: ``None``/``[]`` to sample from the whole eval set, or a
            subset name (str) / list of subset names to filter by first.

    Returns:
        A markdown string with one ``**key**: value`` paragraph per field
        of the sampled row.
    """
    if subset is None or subset == []:
        # np.random.randint's upper bound is EXCLUSIVE, so passing len()
        # covers every valid index. The previous `len(eval_set) - 1` made
        # the last row unreachable (off-by-one).
        sample_index = np.random.randint(0, len(eval_set))
        sample = eval_set[sample_index]
    else:  # filter by subsets (can be list)
        if isinstance(subset, str):
            subset = [subset]
        # filter down dataset to only include the subset(s)
        eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
        sample_index = np.random.randint(0, len(eval_set_filtered))
        sample = eval_set_filtered[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
    return markdown_text
|
| 88 |
|
| 89 |
+
subsets = eval_set.unique("subset")
|
| 90 |
+
|
| 91 |
with gr.Blocks() as app:
|
| 92 |
# create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
|
| 93 |
with gr.Row():
|
|
|
|
| 125 |
with gr.Row():
|
| 126 |
# loads one sample
|
| 127 |
gr.Markdown("## Random Dataset Sample Viewer")
|
| 128 |
+
subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
|
| 129 |
button = gr.Button("Show Random Sample")
|
| 130 |
|
| 131 |
with gr.Row():
|
| 132 |
sample_display = gr.Markdown("{sampled data loads here}")
|
| 133 |
|
| 134 |
+
button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
|
| 135 |
|
| 136 |
|
| 137 |
# Load data when app starts, TODO make this used somewhere...
|
| 138 |
+
# def load_data_on_start():
|
| 139 |
+
# data_herm = load_all_data(repo_dir_herm)
|
| 140 |
+
# herm_table.update(data_herm)
|
| 141 |
+
|
| 142 |
+
# data_herm_avg = avg_over_herm(repo_dir_herm)
|
| 143 |
+
# herm_table.update(data_herm_avg)
|
| 144 |
+
|
| 145 |
+
# data_prefs = load_all_data(repo_dir_prefs)
|
| 146 |
+
# pref_sets_table.update(data_prefs)
|
| 147 |
|
| 148 |
+
# Schedule periodic restarts of the Space so it re-downloads fresh eval
# results (restart_space is defined near the top of the file).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restarted every 3h
scheduler.start()

# queue() enables Gradio's request queue so long-running handlers (dataset
# filtering in random_sample) don't block the app; then serve it.
app.queue().launch()
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
|
|
| 1 |
pandas
|
| 2 |
datasets
|
|
|
|
| 1 |
+
APScheduler==3.10.1
|
| 2 |
pandas
|
| 3 |
datasets
|