rounding
src/populate.py  +31 -6
@@ -166,15 +166,24 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         f1_values = []
         recall_values = []
         precision_values = []
+        accuracy_values = []
+        category_recall_values = []
+        total_samples = 0
 
         for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
             if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
-
-
-
-
-
-
+                test_metrics = category_metrics[test_type]
+                if "f1_binary" in test_metrics and pd.notna(test_metrics["f1_binary"]):
+                    f1_values.append(test_metrics["f1_binary"])
+                if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
+                    recall_values.append(test_metrics["recall_binary"])
+                    category_recall_values.append(test_metrics["recall_binary"])
+                if "precision_binary" in test_metrics and pd.notna(test_metrics["precision_binary"]):
+                    precision_values.append(test_metrics["precision_binary"])
+                if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
+                    accuracy_values.append(test_metrics["accuracy"])
+                if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
+                    total_samples += test_metrics["sample_count"]
 
         # Add overall averages
         if f1_values:
@@ -184,6 +193,22 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         if precision_values:
             filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)
 
+        # Add category-specific values to standard macro metric keys
+        if accuracy_values:
+            filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
+        else:
+            filtered_entry["macro_accuracy"] = pd.NA
+
+        if category_recall_values:
+            filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values)
+        else:
+            filtered_entry["macro_recall"] = pd.NA
+
+        if total_samples > 0:
+            filtered_entry["total_evals_count"] = total_samples
+        else:
+            filtered_entry["total_evals_count"] = pd.NA
+
         filtered_entries.append(filtered_entry)
 
         # Create a new leaderboard data structure with the filtered entries
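For context, the aggregation added above reduces to an unweighted mean of each metric over the four test types, with pd.NA as the fallback when no test type reported a value, plus a plain sum for the sample counts. The sketch below reproduces that logic in isolation; the helper name summarize_category_metrics, the TEST_TYPES constant, and the example dict are hypothetical illustrations, while the metric keys and the averaging mirror the diff.

# Minimal sketch of the aggregation introduced in this commit, assuming
# category_metrics has the same shape as the dicts read in src/populate.py.
# summarize_category_metrics and the example data are hypothetical.
import pandas as pd

TEST_TYPES = ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]

def summarize_category_metrics(category_metrics: dict) -> dict:
    accuracy_values, recall_values = [], []
    total_samples = 0
    for test_type in TEST_TYPES:
        test_metrics = category_metrics.get(test_type)
        if not isinstance(test_metrics, dict):
            continue
        # Skip metrics that are missing or NA so they do not distort the mean.
        if pd.notna(test_metrics.get("accuracy", pd.NA)):
            accuracy_values.append(test_metrics["accuracy"])
        if pd.notna(test_metrics.get("recall_binary", pd.NA)):
            recall_values.append(test_metrics["recall_binary"])
        if pd.notna(test_metrics.get("sample_count", pd.NA)):
            total_samples += test_metrics["sample_count"]
    return {
        # Unweighted mean over the test types that reported a value, else pd.NA.
        "macro_accuracy": sum(accuracy_values) / len(accuracy_values) if accuracy_values else pd.NA,
        "macro_recall": sum(recall_values) / len(recall_values) if recall_values else pd.NA,
        "total_evals_count": total_samples if total_samples > 0 else pd.NA,
    }

# Hypothetical input: only two of the four test types report metrics.
example = {
    "default_prompts": {"accuracy": 0.90, "recall_binary": 0.85, "sample_count": 200},
    "jailbreaked_prompts": {"accuracy": 0.70, "recall_binary": 0.60, "sample_count": 150},
}
print(summarize_category_metrics(example))
# -> macro_accuracy ≈ 0.80, macro_recall ≈ 0.725, total_evals_count == 350

Falling back to pd.NA rather than 0 presumably keeps entries with no evaluations in a category from being sorted or displayed as if they had scored zero.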