rounding
src/populate.py  +31 -6
@@ -166,15 +166,24 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         f1_values = []
         recall_values = []
         precision_values = []
+        accuracy_values = []
+        category_recall_values = []
+        total_samples = 0
 
         for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
             if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
-
-
-
-
-
-
+                test_metrics = category_metrics[test_type]
+                if "f1_binary" in test_metrics and pd.notna(test_metrics["f1_binary"]):
+                    f1_values.append(test_metrics["f1_binary"])
+                if "recall_binary" in test_metrics and pd.notna(test_metrics["recall_binary"]):
+                    recall_values.append(test_metrics["recall_binary"])
+                    category_recall_values.append(test_metrics["recall_binary"])
+                if "precision_binary" in test_metrics and pd.notna(test_metrics["precision_binary"]):
+                    precision_values.append(test_metrics["precision_binary"])
+                if "accuracy" in test_metrics and pd.notna(test_metrics["accuracy"]):
+                    accuracy_values.append(test_metrics["accuracy"])
+                if "sample_count" in test_metrics and pd.notna(test_metrics["sample_count"]):
+                    total_samples += test_metrics["sample_count"]
 
         # Add overall averages
         if f1_values:
@@ -184,6 +193,22 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
         if precision_values:
             filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)
 
+        # Add category-specific values to standard macro metric keys
+        if accuracy_values:
+            filtered_entry["macro_accuracy"] = sum(accuracy_values) / len(accuracy_values)
+        else:
+            filtered_entry["macro_accuracy"] = pd.NA
+
+        if category_recall_values:
+            filtered_entry["macro_recall"] = sum(category_recall_values) / len(category_recall_values)
+        else:
+            filtered_entry["macro_recall"] = pd.NA
+
+        if total_samples > 0:
+            filtered_entry["total_evals_count"] = total_samples
+        else:
+            filtered_entry["total_evals_count"] = pd.NA
+
         filtered_entries.append(filtered_entry)
 
         # Create a new leaderboard data structure with the filtered entries
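For context, the aggregation added above reduces to an unweighted mean of each metric over the four test types, with pd.NA as the fallback when no test type reported a value, plus a plain sum for the sample counts. The sketch below reproduces that logic in isolation; the helper name summarize_category_metrics, the TEST_TYPES constant, and the example dict are hypothetical illustrations, while the metric keys and the averaging mirror the diff.

# Minimal sketch of the aggregation introduced in this commit, assuming
# category_metrics has the same shape as the dicts read in src/populate.py.
# summarize_category_metrics and the example data are hypothetical.
import pandas as pd

TEST_TYPES = ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]

def summarize_category_metrics(category_metrics: dict) -> dict:
    accuracy_values, recall_values = [], []
    total_samples = 0
    for test_type in TEST_TYPES:
        test_metrics = category_metrics.get(test_type)
        if not isinstance(test_metrics, dict):
            continue
        # Skip metrics that are missing or NA so they do not distort the mean.
        if pd.notna(test_metrics.get("accuracy", pd.NA)):
            accuracy_values.append(test_metrics["accuracy"])
        if pd.notna(test_metrics.get("recall_binary", pd.NA)):
            recall_values.append(test_metrics["recall_binary"])
        if pd.notna(test_metrics.get("sample_count", pd.NA)):
            total_samples += test_metrics["sample_count"]
    return {
        # Unweighted mean over the test types that reported a value, else pd.NA.
        "macro_accuracy": sum(accuracy_values) / len(accuracy_values) if accuracy_values else pd.NA,
        "macro_recall": sum(recall_values) / len(recall_values) if recall_values else pd.NA,
        "total_evals_count": total_samples if total_samples > 0 else pd.NA,
    }

# Hypothetical input: only two of the four test types report metrics.
example = {
    "default_prompts": {"accuracy": 0.90, "recall_binary": 0.85, "sample_count": 200},
    "jailbreaked_prompts": {"accuracy": 0.70, "recall_binary": 0.60, "sample_count": 150},
}
print(summarize_category_metrics(example))
# -> macro_accuracy ≈ 0.80, macro_recall ≈ 0.725, total_evals_count == 350

Falling back to pd.NA rather than 0 presumably keeps entries with no evaluations in a category from being sorted or displayed as if they had scored zero.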