Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -34,8 +34,10 @@ all_data = pd.concat(
|
|
| 34 |
[data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
|
| 35 |
)
|
| 36 |
|
| 37 |
-
all_model_names = all_data[
|
| 38 |
-
all_text_only_model_names = list(
|
|
|
|
|
|
|
| 39 |
print(all_text_only_model_names)
|
| 40 |
|
| 41 |
## Continue with the cold code --
|
|
@@ -118,8 +120,6 @@ cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
|
|
| 118 |
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
|
| 119 |
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
def load_heatmap(evt: gr.SelectData):
    """Return a gr.Image for the heatmap of the model picked in the table.

    The selected cell's value is the model name; the matching pre-rendered
    heatmap lives at results/<model name>.jpg.
    """
    return gr.Image(f"results/{evt.value}.jpg")
|
|
@@ -142,46 +142,49 @@ def load_cot_vision_heatmap(evt: gr.SelectData):
|
|
| 142 |
|
| 143 |
def calculate_order_by_first_substring(selected_models):
|
| 144 |
|
| 145 |
-
first_columns = all_data[all_data[
|
| 146 |
-
query_ids_df = first_columns[first_columns[
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
# Filter to include only the selected models
|
| 150 |
-
query_ids_df = query_ids_df[query_ids_df['Model Name'].isin(selected_models)]
|
| 151 |
|
| 152 |
print(len(query_ids_df))
|
| 153 |
|
| 154 |
-
query_ids_df = query_ids_df.groupby(
|
|
|
|
|
|
|
| 155 |
|
| 156 |
print(len(query_ids_df))
|
| 157 |
|
| 158 |
query_ids = query_ids_df.query_id.unique()
|
| 159 |
-
# print('query_ids', len(query_ids))
|
| 160 |
-
|
| 161 |
-
# filter out fsm_ids and
|
| 162 |
fsm_ids = query_ids_df.fsm_id.unique()
|
| 163 |
-
print(
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
text_only_filtered
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
return text_only_filtered
|
| 182 |
|
| 183 |
|
| 184 |
-
|
| 185 |
with gr.Blocks() as demo:
|
| 186 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
| 187 |
with gr.Tab("Text-only Benchmark"):
|
|
@@ -227,11 +230,16 @@ with gr.Blocks() as demo:
|
|
| 227 |
with gr.Tab("Constraint Text-only Results"):
|
| 228 |
gr.Markdown("## Constraint Text-only Leaderboard by first substring")
|
| 229 |
included_models = gr.CheckboxGroup(
|
| 230 |
-
label="Models to include",
|
|
|
|
|
|
|
| 231 |
)
|
| 232 |
constrained_leader_board_text = gr.Dataframe()
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
demo.launch()
|
|
|
|
| 34 |
[data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
|
| 35 |
)
|
| 36 |
|
| 37 |
+
# Distinct model names across the combined benchmark data, plus the subset
# of names belonging to text-only models (used to seed the checkbox group).
all_model_names = all_data["Model Name"].unique()
_text_only_rows = all_data[all_data["Model Type"] == "Text Only"]
all_text_only_model_names = list(_text_only_rows["Model Name"].unique())
print(all_text_only_model_names)
|
| 42 |
|
| 43 |
## Continue with the cold code --
|
|
|
|
| 120 |
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
|
| 121 |
|
| 122 |
|
|
|
|
|
|
|
| 123 |
def load_heatmap(evt: gr.SelectData):
    """Build the heatmap image component for the selected model.

    evt.value holds the clicked model name; the corresponding heatmap
    image is expected at results/<model name>.jpg.
    """
    heatmap_path = f"results/{evt.value}.jpg"
    heatmap_image = gr.Image(heatmap_path)
    return heatmap_image
|
|
|
|
| 142 |
|
| 143 |
def calculate_order_by_first_substring(selected_models):
    """Build a text-only leaderboard restricted to "easy" FSMs.

    An FSM counts as easy when, on its first substring, every one of the
    selected text-only models got a correct judge response. All text-only
    models are then scored over all rows for just those FSMs.

    Args:
        selected_models: iterable of model names (from the checkbox group)
            used to decide which FSMs qualify.

    Returns:
        pandas.DataFrame with columns 'Model Name' and 'Accuracy' (percent,
        rounded to 2 decimals), sorted by Accuracy descending.
    """
    # First substring only, text-only models only, then the user's selection.
    first_columns = all_data[all_data["substring_index"] == 1]
    query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
    query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]

    print(len(query_ids_df))

    # Keep only queries where every selected model was judged correct (all 1s).
    query_ids_df = query_ids_df.groupby("query_id").filter(
        lambda group: group["parsed_judge_response"].eq(1).all()
    )

    print(len(query_ids_df))

    # FSMs whose first substring every selected model solved.
    # (NOTE: dropped the unused `query_ids` local the original computed here.)
    fsm_ids = query_ids_df.fsm_id.unique()
    print(
        "fsm_ids",
        len(fsm_ids),
        "Total of 25 FSM is solvable by everything on the first substring",
    )

    # Score all text-only models on just those FSMs (all substrings).
    text_only = all_data[all_data["Model Type"] == "Text Only"]
    text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]

    print(
        f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}"
    )

    # Mean judge response per model -> percentage accuracy, 2 decimals.
    text_only_filtered = (
        text_only_filtered.groupby(["Model Name"])["parsed_judge_response"]
        .mean()
        .reset_index()
    )
    # Vectorized Series.round replaces the original per-element
    # .apply(lambda x: round(x, 2)).
    text_only_filtered["Accuracy"] = (
        text_only_filtered["parsed_judge_response"] * 100
    ).round(2)
    text_only_filtered.drop("parsed_judge_response", axis=1, inplace=True)
    text_only_filtered.sort_values("Accuracy", ascending=False, inplace=True)

    return text_only_filtered
|
| 186 |
|
| 187 |
|
|
|
|
| 188 |
with gr.Blocks() as demo:
|
| 189 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
| 190 |
with gr.Tab("Text-only Benchmark"):
|
|
|
|
| 230 |
with gr.Tab("Constraint Text-only Results"):
|
| 231 |
gr.Markdown("## Constraint Text-only Leaderboard by first substring")
|
| 232 |
included_models = gr.CheckboxGroup(
|
| 233 |
+
label="Models to include",
|
| 234 |
+
choices=all_text_only_model_names,
|
| 235 |
+
value=all_text_only_model_names,
|
| 236 |
)
|
| 237 |
constrained_leader_board_text = gr.Dataframe()
|
| 238 |
|
| 239 |
+
included_models.input(
|
| 240 |
+
fn=calculate_order_by_first_substring,
|
| 241 |
+
inputs=[included_models],
|
| 242 |
+
outputs=[constrained_leader_board_text],
|
| 243 |
+
)
|
| 244 |
|
| 245 |
demo.launch()
|