from typing import Any, Dict, List, Optional

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge metric class for evaluating the correctness of predictions.

    Attributes:
        main_score (str): The main score used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The number of instances sent to the inference engine in each bulk call.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The engine that runs inference over the judge dataset.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """
    main_score: str = "llm_as_judge"
    reduction_map: Optional[Dict[str, List[str]]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            # Default to averaging the main score across all instances.
            self.reduction_map = {"mean": [self.main_score]}
    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
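        # Pair each task-data instance with its model prediction. The
        # "rating_label" value "[[5]]" is a dummy target that lets the recipe
        # build complete instances; the score comes from the judge's verdict,
        # not from this label.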
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]
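        # Build the judge dataset from the configured recipe, then run bulk
        # inference to collect the judge's raw verdicts.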
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
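        # Score the verdicts with unitxt's metric pipeline, loaded through the
        # Hugging Face `evaluate` hub.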
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
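        # Each processed instance's "prediction" now holds the judge's rating
        # (as extracted by the recipe's postprocessors); report it as this
        # metric's main score.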
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
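

# A minimal usage sketch, not part of the module itself. It assumes
# HFPipelineBasedInferenceEngine as the InferenceEngine implementation and a
# recipe whose card and judge template exist in the local unitxt catalog; the
# card name below is a hypothetical placeholder.
#
#     from unitxt.inference import HFPipelineBasedInferenceEngine
#
#     engine = HFPipelineBasedInferenceEngine(
#         model_name="google/flan-t5-large", max_new_tokens=32
#     )
#     metric = LLMAsJudge(
#         recipe="card=cards.my_judge_card,"
#         "template=templates.response_assessment.rating.mt_bench_single_turn",
#         inference_model=engine,
#     )
#     scores = metric.compute(references, predictions, task_data)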