from typing import Any, Callable, Dict, List, Literal, Optional, Sequence

import numpy as np
from datasets import Dataset, load_dataset

from camel.agents import ChatAgent
from camel.benchmarks import BaseBenchmark
from camel.logger import get_logger
from camel.retrievers import AutoRetriever

logger = get_logger(__name__)
|
|
class RagasFields:
    r"""Constants for RAGAS evaluation field names."""

    INPUT_CONTEXT = "contexts"
    INPUT_QUESTION = "question"
    INPUT_ANSWER = "answer"
|
|
def annotate_dataset(
    dataset: Dataset,
    context_call: Optional[Callable[[Dict[str, Any]], List[str]]],
    answer_call: Optional[Callable[[Dict[str, Any]], str]],
) -> Dataset:
    r"""Annotate the dataset by adding context and answers using the
    provided functions.

    Args:
        dataset (Dataset): The input dataset to annotate.
        context_call (Optional[Callable[[Dict[str, Any]], List[str]]]):
            Function to generate context for each example.
        answer_call (Optional[Callable[[Dict[str, Any]], str]]): Function to
            generate answer for each example.

    Returns:
        Dataset: The annotated dataset with added contexts and/or answers.
    """

    def process_example(example: Dict[str, Any]) -> Dict[str, Any]:
        if context_call:
            example["contexts"] = context_call(example)
        if answer_call:
            example["answer"] = answer_call(example)
        return example

    return dataset.map(process_example)
|
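# A minimal usage sketch for `annotate_dataset` (the toy callables and texts
# below are illustrative only, not part of the benchmark):
#
#     ds = Dataset.from_dict({"question": ["What is RAG?"]})
#     ds = annotate_dataset(
#         ds,
#         context_call=lambda ex: ["RAG pairs a retriever with a generator."],
#         answer_call=lambda ex: "RAG grounds generation in retrieved text.",
#     )
#     # `ds` now carries "contexts" and "answer" columns next to "question".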
|
def rmse(
    input_trues: Sequence[float],
    input_preds: Sequence[float],
) -> Optional[float]:
    r"""Calculate Root Mean Squared Error (RMSE).

    Args:
        input_trues (Sequence[float]): Ground truth values.
        input_preds (Sequence[float]): Predicted values.

    Returns:
        Optional[float]: RMSE value, or None if the inputs have different
            lengths or contain no valid predictions.
    """
    if len(input_trues) != len(input_preds):
        logger.warning("Input lengths mismatch in RMSE calculation")
        return None

    trues = np.array(input_trues)
    preds = np.array(input_preds, dtype=float)

    # Evaluate only on positions where the prediction is not NaN.
    eval_idx = ~np.isnan(preds)
    if not np.any(eval_idx):
        logger.warning("No valid predictions for RMSE calculation")
        return None

    trues = trues[eval_idx]
    preds = preds[eval_idx]

    return float(np.sqrt(np.mean((preds - trues) ** 2)))
|
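# Worked example for `rmse`: with trues = [1.0, 0.0, 1.0] and
# preds = [0.9, 0.2, float("nan")], the NaN prediction is dropped and the
# result is sqrt(((0.9 - 1.0) ** 2 + (0.2 - 0.0) ** 2) / 2), about 0.158.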
|
def auroc(trues: Sequence[bool], preds: Sequence[float]) -> float:
    r"""Calculate Area Under the Receiver Operating Characteristic Curve
    (AUROC).

    Args:
        trues (Sequence[bool]): Ground truth binary values.
        preds (Sequence[float]): Predicted probability values.

    Returns:
        float: AUROC score, or 0.5 (chance level) if no valid predictions
            are available.
    """
    from sklearn.metrics import roc_auc_score

    preds_arr = np.array(preds, dtype=float)

    # Ignore NaN predictions; fall back to chance level if none are valid.
    eval_idx = ~np.isnan(preds_arr)
    if not np.any(eval_idx):
        logger.warning("No valid predictions for AUROC calculation")
        return 0.5

    return float(
        roc_auc_score(np.array(trues)[eval_idx], preds_arr[eval_idx])
    )
|
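# Worked example for `auroc`: with trues = [True, False, True, False] and
# preds = [0.9, 0.4, 0.8, 0.1], every positive outranks every negative, so
# the score is 1.0; if all predictions were NaN, 0.5 would be returned.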
|
def ragas_calculate_metrics(
    dataset: Dataset,
    pred_context_relevance_field: Optional[str],
    pred_faithfulness_field: Optional[str],
    metrics_to_evaluate: Optional[List[str]] = None,
    ground_truth_context_relevance_field: str = "relevance_score",
    ground_truth_faithfulness_field: str = "adherence_score",
) -> Dict[str, Optional[float]]:
    r"""Calculate RAGAS evaluation metrics.

    Args:
        dataset (Dataset): The dataset containing predictions and ground
            truth.
        pred_context_relevance_field (Optional[str]): Field name for
            predicted context relevance.
        pred_faithfulness_field (Optional[str]): Field name for predicted
            faithfulness.
        metrics_to_evaluate (Optional[List[str]]): List of metrics to
            evaluate.
        ground_truth_context_relevance_field (str): Field name for ground
            truth relevance.
        ground_truth_faithfulness_field (str): Field name for ground truth
            adherence.

    Returns:
        Dict[str, Optional[float]]: Dictionary of calculated metrics.
    """
    metrics_to_evaluate = metrics_to_evaluate or [
        "context_relevancy",
        "faithfulness",
    ]
    calculated_metrics: Dict[str, Optional[float]] = {}

    if (
        "context_relevancy" in metrics_to_evaluate
        and pred_context_relevance_field
    ):
        # Compare predicted context relevance against the ground truth
        # relevance scores via RMSE.
        trues_relevance = dataset[ground_truth_context_relevance_field]
        preds_relevance = dataset[pred_context_relevance_field]
        calculated_metrics["relevance_rmse"] = rmse(
            trues_relevance, preds_relevance
        )

    if "faithfulness" in metrics_to_evaluate and pred_faithfulness_field:
        # Hallucination is the complement of adherence/faithfulness, so both
        # the ground truth and the predictions are inverted before AUROC.
        trues_hallucination = ~np.array(
            dataset[ground_truth_faithfulness_field]
        )
        preds_hallucination = 1 - np.array(
            dataset[pred_faithfulness_field], dtype=float
        )
        calculated_metrics["hallucination_auroc"] = auroc(
            trues_hallucination.tolist(), preds_hallucination.tolist()
        )

    return calculated_metrics
|
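# A minimal sketch of the fields `ragas_calculate_metrics` expects (field
# names follow the RAGBench schema used above; the values are illustrative):
#
#     ds = Dataset.from_dict({
#         "relevance_score": [0.8, 0.3],
#         "adherence_score": [True, False],
#         "context_relevancy": [0.7, 0.4],
#         "faithfulness": [0.9, 0.2],
#     })
#     ragas_calculate_metrics(
#         ds,
#         pred_context_relevance_field="context_relevancy",
#         pred_faithfulness_field="faithfulness",
#     )
#     # -> {"relevance_rmse": ..., "hallucination_auroc": ...}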
|
def ragas_evaluate_dataset(
    dataset: Dataset,
    contexts_field_name: Optional[str],
    answer_field_name: Optional[str],
    metrics_to_evaluate: Optional[List[str]] = None,
) -> Dataset:
    r"""Evaluate the dataset using RAGAS metrics.

    Args:
        dataset (Dataset): Input dataset to evaluate.
        contexts_field_name (Optional[str]): Field name containing contexts.
        answer_field_name (Optional[str]): Field name containing answers.
        metrics_to_evaluate (Optional[List[str]]): List of metrics to
            evaluate.

    Returns:
        Dataset: Dataset with added evaluation metrics.
    """
    from ragas import evaluate
    from ragas.metrics import (
        context_relevancy,
        faithfulness,
    )

    metrics_to_evaluate = metrics_to_evaluate or [
        "context_relevancy",
        "faithfulness",
    ]

    # Rename input columns to the field names RAGAS expects.
    if (
        contexts_field_name
        and contexts_field_name != RagasFields.INPUT_CONTEXT
    ):
        dataset = dataset.rename_column(
            contexts_field_name, RagasFields.INPUT_CONTEXT
        )
    if answer_field_name and answer_field_name != RagasFields.INPUT_ANSWER:
        dataset = dataset.rename_column(
            answer_field_name, RagasFields.INPUT_ANSWER
        )

    metrics = []
    if "context_relevancy" in metrics_to_evaluate:
        metrics.append(context_relevancy)
    if "faithfulness" in metrics_to_evaluate:
        metrics.append(faithfulness)

    ragas_result = evaluate(dataset, metrics=metrics)
    return Dataset.from_pandas(ragas_result.to_pandas())
|
|
class RAGBenchBenchmark(BaseBenchmark):
    r"""RAGBench Benchmark for evaluating RAG performance.

    This benchmark uses the rungalileo/ragbench dataset to evaluate
    retrieval-augmented generation (RAG) systems. It measures context
    relevancy and faithfulness metrics as described in
    https://arxiv.org/abs/2407.11005.

    Args:
        processes (int, optional): Number of processes for parallel
            processing.
        subset (str, optional): Dataset subset to use (e.g., "hotpotqa").
        split (str, optional): Dataset split to use (e.g., "test").
    """

    def __init__(
        self,
        processes: int = 1,
        subset: Literal[
            "covidqa",
            "cuad",
            "delucionqa",
            "emanual",
            "expertqa",
            "finqa",
            "hagrid",
            "hotpotqa",
            "msmarco",
            "pubmedqa",
            "tatqa",
            "techqa",
        ] = "hotpotqa",
        split: Literal["train", "test", "validation"] = "test",
    ) -> None:
        super().__init__("ragbench", "rag_bench", "", processes)
        self.subset = subset
        self.split = split
        self.dataset: Optional[Dataset] = None

    def download(self):
        r"""Download the RAGBench dataset."""
        try:
            self.dataset = load_dataset(
                "rungalileo/ragbench", self.subset, split=self.split
            )
        except Exception as e:
            logger.error(f"Failed to download dataset: {e}")
            raise

    def load(self, force_download: bool = False):
        r"""Load the RAGBench dataset.

        Args:
            force_download (bool, optional): Whether to force download the
                data.
        """
        if force_download or self.dataset is None:
            logger.info(
                "%s dataset",
                "Force downloading" if force_download else "Loading",
            )
            self.download()
|
    def run(
        self,
        agent: ChatAgent,
        auto_retriever: AutoRetriever,
    ) -> Dict[str, Optional[float]]:
        r"""Run the benchmark evaluation.

        Args:
            agent (ChatAgent): Chat agent for generating answers.
            auto_retriever (AutoRetriever): Retriever for finding relevant
                contexts.

        Returns:
            Dict[str, Optional[float]]: Dictionary of evaluation metrics.
        """
        if self.dataset is None:
            raise ValueError(
                "Dataset is not loaded. Call `load()` before `run()`."
            )

        def context_call(example: Dict[str, Any]) -> List[str]:
            retrieved_info = auto_retriever.run_vector_retriever(
                query=example['question'],
                contents=example['documents'],
                top_k=1,
                return_detailed_info=True,
                similarity_threshold=0.5,
            )
            return [c['text'] for c in retrieved_info['Retrieved Context']]

        def answer_call(example: Dict[str, Any]) -> str:
            user_msg = str(example)
            assistant_response = agent.step(user_msg)
            return assistant_response.msg.content

        annotated_ds = annotate_dataset(
            self.dataset, context_call, answer_call
        )
        evaluated_ds = ragas_evaluate_dataset(
            annotated_ds,
            contexts_field_name="contexts",
            answer_field_name="answer",
            metrics_to_evaluate=["context_relevancy", "faithfulness"],
        )

        return ragas_calculate_metrics(
            evaluated_ds,
            pred_context_relevance_field="context_relevancy",
            pred_faithfulness_field="faithfulness",
        )
|
|
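# A minimal end-to-end sketch. How the `ChatAgent` and `AutoRetriever` are
# configured is an assumption (model backend, storage, and system message are
# not prescribed by this module); adjust them to your own setup.
if __name__ == "__main__":
    benchmark = RAGBenchBenchmark(subset="hotpotqa", split="test")
    benchmark.load()

    # Assumed defaults: a ChatAgent with a plain system message and an
    # AutoRetriever built with its default configuration.
    agent = ChatAgent(
        "You answer questions using only the provided documents."
    )
    auto_retriever = AutoRetriever()

    metrics = benchmark.run(agent, auto_retriever)
    print(metrics)  # e.g. {'relevance_rmse': ..., 'hallucination_auroc': ...}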