Upload metrics.py with huggingface_hub

metrics.py  (changed: +51 -14)
@@ -5,9 +5,7 @@ from dataclasses import field
 from typing import Any, Dict, Generator, List, Optional
 
 import evaluate
-import nltk
 import numpy
-from editdistance import eval
 
 from .dataclass import InternalField
 from .operator import (
@@ -19,8 +17,6 @@ from .operator import (
 from .operators import CopyFields
 from .stream import MultiStream, Stream
 
-nltk.download("punkt")
-
 
 def abstract_factory():
     return {}
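Note: the module-level nltk and editdistance imports (and the eager nltk.download("punkt") call) removed above are re-introduced lazily inside the metrics that need them; see the Rouge and CharEditDistanceAccuracy hunks below. A minimal sketch of that lazy-loading pattern, assuming a prepare() hook like the one these metric classes expose (the class name here is hypothetical, not from the library):

class SentenceTokenizingMetric:
    # Hypothetical sketch: defer the optional nltk dependency until the metric
    # is actually prepared, instead of paying for it at module import time.
    def prepare(self):
        import nltk  # imported only when this metric is used

        nltk.download("punkt")  # fetch the sentence-tokenizer data once
        self.sent_tokenize = nltk.sent_tokenize

    def split_sentences(self, text: str):
        # Uses the tokenizer bound in prepare().
        return self.sent_tokenize(text.strip())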
@@ -65,7 +61,8 @@ class GlobalMetric(SingleStreamOperator, Metric):
        try:
            instance_score = self._compute([refs], [pred])
        except:
-           instance_score = {"score": None}
+           instance_score = {"score": None, "score_name": self.main_score}
+
            if isinstance(self.main_score, str) and self.main_score is not None:
                instance_score[self.main_score] = None
 
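Note: the recurring change in this commit is an extra score_name entry stored next to score, so consumers can tell which metric the aggregated score refers to, even when the computation failed and score is None. A hedged sketch of how such a result dict might be read downstream (the dict shapes are inferred from this diff, not from library documentation):

def describe_result(result: dict) -> str:
    # Assumes the result dict carries "score" plus the "score_name" field
    # added in this commit, e.g. {"f1": 0.5, "score": 0.5, "score_name": "f1"}.
    name = result.get("score_name", "score")
    score = result.get("score")
    if score is None:
        return f"{name}: computation failed"
    return f"{name}: {score:.3f}"


print(describe_result({"f1": 0.5, "score": 0.5, "score_name": "f1"}))  # f1: 0.500
print(describe_result({"score": None, "score_name": "f1"}))  # f1: computation failed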
@@ -86,6 +83,7 @@ class GlobalMetric(SingleStreamOperator, Metric):
     def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
         result = self.compute(references, predictions)
         result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
         return result
 
     @abstractmethod
@@ -131,6 +129,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
            global_score[field] = mean([instance["score"]["instance"][field] for instance in instances])
            if field == self.main_score:
                global_score["score"] = global_score[field]
+               global_score["score_name"] = self.main_score
 
        for instance in instances:
            yield instance
@@ -138,6 +137,7 @@ class InstanceMetric(SingleStreamOperator, Metric):
     def _compute(self, references: List[List[str]], predictions: List[str]) -> dict:
         result = self.compute(references=references, predictions=predictions)
         result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
         return result
 
     @abstractmethod
@@ -147,7 +147,6 @@ class InstanceMetric(SingleStreamOperator, Metric):
 
 class Squad(GlobalMetric):
     _metric = None
-    reduction_map = {"mean": ["f1"]}
     main_score = "f1"
     metric = "squad"
 
@@ -172,6 +171,7 @@ class SingleReferenceInstanceMetric(InstanceMetric):
     def _compute(self, references: List[str], prediction: str) -> dict:
         result = self.compute(references[0], prediction)
         result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
         return result
 
     @abstractmethod
@@ -288,6 +288,7 @@ class F1MultiLabel(GlobalMetric):
     _metric = None
     main_score = "f1_macro"
     average = None  # Report per class then aggregate by mean
+    classes_to_ignore = ["none"]
 
     def prepare(self):
         super(F1MultiLabel, self).prepare()
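Note: the new classes_to_ignore attribute feeds the label filtering in the next hunk: labels are collected from the references, ignored class names (here "none") are dropped, and when nothing remains the metric returns NaN rather than a misleading F1. A small standalone sketch of that filtering step (variable names mirror the diff, but this is not the library's code):

def collect_labels(references, classes_to_ignore=("none",)):
    # Gather the label set over all references, dropping ignored classes.
    return [
        label
        for label in set(label for reference in references for label in reference)
        if label not in classes_to_ignore
    ]


print(sorted(collect_labels([["cats", "none"], ["none"], ["dogs"]])))  # ['cats', 'dogs']
print(collect_labels([["none"], ["none"]]))  # [] -> F1 is undefined, return NaN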
@@ -314,17 +315,41 @@
             len(reference) == 1 for reference in references
         ), "Only a single reference per prediction is allowed in F1 metric"
         references = [reference[0] for reference in references]
-        labels =
+        labels = [
+            l
+            for l in set([label for reference in references for label in reference])
+            if l not in self.classes_to_ignore
+        ]
+        # if no classes are left then F1 is not defined
+        # (e.g. only "none" in references)
+        if len(labels) == 0:
+            return {self.main_score: float("nan")}
+
         for label in labels:
             self.add_str_to_id(label)
         formatted_references = [self.get_one_hot_vector(reference) for reference in references]
         formatted_predictions = [self.get_one_hot_vector(prediction) for prediction in predictions]
+
+        # There is odd behavior in scikit-learn that when passing a one-hot vector with a single
+        # element, it is treated a class identifier. Therefore, we add labels=[1] to limit to only
+        # to this class.
+        if len(labels) == 1:
+            labels_param = [1]
+        else:
+            labels_param = None
+
         result = self._metric.compute(
-            predictions=formatted_predictions,
+            predictions=formatted_predictions,
+            references=formatted_references,
+            average=self.average,
+            labels=labels_param,
         )
         if isinstance(result["f1"], numpy.ndarray):
             from statistics import mean
 
+            assert len(result["f1"]) == len(
+                labels
+            ), f'F1 result ({result["f1"]}) has more entries than labels ({labels})'
             final_result = {self.main_score: mean(result["f1"])}
             for i, label in enumerate(labels):
                 final_result["f1_" + label] = result["f1"][i]
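Note: the labels_param workaround above reflects how scikit-learn reads indicator matrices. With two or more label columns the input counts as multilabel and average=None yields one F1 per label; with a single column the 0/1 values degenerate to plain class identifiers, so the call is restricted to the positive class with labels=[1]. The diff goes through the evaluate "f1" metric; the sketch below calls sklearn.metrics.f1_score directly to illustrate the same behavior:

import numpy as np
from sklearn.metrics import f1_score

# Multi-label case: a (n_samples, n_labels) indicator matrix with more than
# one column; average=None returns one F1 value per label column.
y_true = np.array([[1, 0], [0, 1], [1, 1]])
y_pred = np.array([[1, 0], [1, 1], [1, 0]])
print(f1_score(y_true, y_pred, average=None))  # e.g. array([0.8, 0.667])

# Single-column edge case: treated as binary labels, so 0/1 become class
# identifiers; labels=[1] keeps only the positive class, matching the hunk.
y_true_one = np.array([[1], [0], [1]])
y_pred_one = np.array([[1], [1], [1]])
print(f1_score(y_true_one, y_pred_one, average=None, labels=[1]))  # array([0.8])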
@@ -348,24 +373,36 @@ class Rouge(HuggingfaceMetric):
     main_score = "rougeL"
     scale = 1.0
 
+    def prepare(self):
+        super().prepare()
+        import nltk
+
+        nltk.download("punkt")
+        self.sent_tokenize = nltk.sent_tokenize
+
     def compute(self, references, predictions):
-        predictions = ["\n".join(
-        references = [["\n".join(
+        predictions = ["\n".join(self.sent_tokenize(prediction.strip())) for prediction in predictions]
+        references = [["\n".join(self.sent_tokenize(r.strip())) for r in reference] for reference in references]
         return super().compute(references, predictions)
 
 
-# Computes chat edit distance, ignoring
+# Computes chat edit distance, ignoring whitespace
 class CharEditDistanceAccuracy(SingleReferenceInstanceMetric):
     reduction_map = {"mean": ["char_edit_dist_accuracy"]}
     main_score = "char_edit_dist_accuracy"
 
+    def prepare(self):
+        import editdistance
+
+        self.eval = editdistance.eval
+
     def compute(self, reference, prediction: str) -> dict:
-        formatted_prediction = "
-        formatted_reference = "
+        formatted_prediction = "".join(prediction.split())
+        formatted_reference = "".join(reference.split())
         max_length = max(len(formatted_reference), len(formatted_prediction))
         if max_length == 0:
             return 0
-        edit_dist = eval(formatted_reference, formatted_prediction)
+        edit_dist = self.eval(formatted_reference, formatted_prediction)
         return {"char_edit_dist_accuracy": (1 - edit_dist / max_length)}
 
 
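Note: CharEditDistanceAccuracy now also resolves its editdistance dependency lazily in prepare() and strips all whitespace before measuring the edit distance. A short sketch of the resulting accuracy formula using the editdistance package's eval function (the helper below is illustrative, not part of the library):

import editdistance


def char_edit_dist_accuracy(reference: str, prediction: str) -> float:
    # Remove all whitespace, then normalize the edit distance by the longer
    # string, as in the compute() method above.
    ref = "".join(reference.split())
    pred = "".join(prediction.split())
    max_length = max(len(ref), len(pred))
    if max_length == 0:
        return 0.0
    return 1 - editdistance.eval(ref, pred) / max_length


print(editdistance.eval("banana", "bahama"))  # 2
print(char_edit_dist_accuracy("the cat", "thecat"))  # 1.0 (whitespace ignored)
print(char_edit_dist_accuracy("banana", "bahama"))  # 1 - 2/6 ≈ 0.667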