Spaces:
Runtime error
Runtime error
Normalize scores to the majority-class baseline
Browse files
src/leaderboard/read_evals.py
CHANGED
|
@@ -160,13 +160,23 @@ class EvalResult:
|
|
| 160 |
|
| 161 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
| 162 |
|
| 163 |
-
average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
| 164 |
-
average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
| 165 |
-
average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
| 166 |
-
|
| 167 |
-
#
|
| 168 |
-
#
|
| 169 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
data_dict = {}
|
| 172 |
# data_dict = {
|
|
|
|
| 160 |
|
| 161 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
| 162 |
|
| 163 |
+
# average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
|
| 164 |
+
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
| 165 |
+
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
| 166 |
+
# print('XXXXXXXXXXXX')
|
| 167 |
+
# print(self.eval_name)
|
| 168 |
+
# print(all_tasks)
|
| 169 |
+
# print(baselines)
|
| 170 |
+
# print(self.results)
|
| 171 |
+
# print('XXXXXXXXXXXX')
|
| 172 |
+
|
| 173 |
+
# average = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in all_tasks]) / len(all_tasks)
|
| 174 |
+
# average_g = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in g_tasks]) / len(g_tasks)
|
| 175 |
+
# average_mc = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in mc_tasks]) / len(mc_tasks)
|
| 176 |
+
|
| 177 |
+
average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
|
| 178 |
+
average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
|
| 179 |
+
average_mc = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in mc_tasks]) / len(mc_tasks)
|
| 180 |
|
| 181 |
data_dict = {}
|
| 182 |
# data_dict = {
|