from typing import Any, Dict, List, Optional

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge metric class for evaluating the correctness of predictions.

    Attributes:
        main_score (str): The main score used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The number of instances sent to the inference engine in each bulk call.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The engine that runs inference over the judge dataset.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """
    main_score: str = "llm_as_judge"
    reduction_map: Optional[Dict[str, List[str]]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            # Default to averaging the main score across all instances.
            self.reduction_map = {"mean": [self.main_score]}
    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
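        # Pair each task-data instance with its model prediction. The
        # "rating_label" value "[[5]]" is a dummy target that lets the recipe
        # build complete instances; the score comes from the judge's verdict,
        # not from this label.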
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]
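        # Build the judge dataset from the configured recipe, then run bulk
        # inference to collect the judge's raw verdicts.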
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
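        # Score the verdicts with unitxt's metric pipeline, loaded through the
        # Hugging Face `evaluate` hub.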
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
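        # Each processed instance's "prediction" now holds the judge's rating
        # (as extracted by the recipe's postprocessors); report it as this
        # metric's main score.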
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
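

# A minimal usage sketch, not part of the module itself. It assumes
# HFPipelineBasedInferenceEngine as the InferenceEngine implementation and a
# recipe whose card and judge template exist in the local unitxt catalog; the
# card name below is a hypothetical placeholder.
#
#     from unitxt.inference import HFPipelineBasedInferenceEngine
#
#     engine = HFPipelineBasedInferenceEngine(
#         model_name="google/flan-t5-large", max_new_tokens=32
#     )
#     metric = LLMAsJudge(
#         recipe="card=cards.my_judge_card,"
#         "template=templates.response_assessment.rating.mt_bench_single_turn",
#         inference_model=engine,
#     )
#     scores = metric.compute(references, predictions, task_data)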