Upload folder using huggingface_hub
- api.py +41 -9
- artifact.py +7 -2
- collections_operators.py +22 -4
- dialog_operators.py +2 -2
- formats.py +1 -0
- generator_utils.py +2 -32
- inference.py +376 -55
- llm_as_judge.py +261 -62
- loaders.py +14 -6
- metric_utils.py +18 -9
- metrics.py +206 -67
- operators.py +79 -47
- processors.py +77 -2
- settings_utils.py +1 -0
- split_utils.py +6 -1
- splitters.py +4 -2
- standard.py +6 -6
- stream.py +4 -3
- stream_operators.py +5 -3
- string_operators.py +9 -0
- struct_data_operators.py +194 -5
- templates.py +1 -1
- type_utils.py +3 -0
- utils.py +84 -1
- version.py +1 -1
api.py
CHANGED
@@ -1,10 +1,10 @@
+import json
 from functools import lru_cache
 from typing import Any, Dict, List, Optional, Union

-from datasets import DatasetDict
-
 from .artifact import fetch_artifact
 from .dataset_utils import get_dataset_artifact
+from .inference import InferenceEngine, LogProbInferenceEngine
 from .logging_utils import get_logger
 from .metric_utils import _compute, _inference_post_process
 from .operator import SourceOperator

@@ -14,7 +14,7 @@ from .standard import StandardRecipe
 logger = get_logger()


-def load(source: Union[SourceOperator, str])
+def load(source: Union[SourceOperator, str]):
     assert isinstance(
         source, (SourceOperator, str)
     ), "source must be a SourceOperator or a string"

@@ -79,7 +79,9 @@ def load_recipe(dataset_query: Optional[str] = None, **kwargs) -> StandardRecipe
     return recipe


-def load_dataset(dataset_query: Optional[str] = None, **kwargs) -> DatasetDict:
+def load_dataset(
+    dataset_query: Optional[str] = None, streaming: bool = False, **kwargs
+):
     """Loads dataset.

     If the 'dataset_query' argument is provided, then dataset is loaded from a card in local

@@ -90,6 +92,7 @@ def load_dataset(dataset_query: Optional[str] = None, **kwargs) -> DatasetDict:
         dataset_query (str, optional): A string query which specifies a dataset to load from local catalog or name of specific recipe or benchmark in the catalog.
             For example:
             "card=cards.wnli,template=templates.classification.multi_class.relation.default".
+        streaming (bool, False): When True yields the data as Unitxt streams dictionary
         **kwargs: Arguments used to load dataset from provided card, which is not present in local catalog.

     Returns:

@@ -107,6 +110,9 @@ def load_dataset(dataset_query: Optional[str] = None, **kwargs) -> DatasetDict:
     """
     recipe = load_recipe(dataset_query, **kwargs)

+    if streaming:
+        return recipe()
+
     return recipe().to_dataset(features=UNITXT_DATASET_SCHEMA)

@@ -135,19 +141,45 @@ def produce(instance_or_instances, dataset_query: Optional[str] = None, **kwargs

 def infer(
     instance_or_instances,
-    engine,
+    engine: InferenceEngine,
     dataset_query: Optional[str] = None,
-    return_data=False,
+    return_data: bool = False,
+    return_log_probs: bool = False,
+    return_meta_data: bool = False,
     **kwargs,
 ):
     dataset = produce(instance_or_instances, dataset_query, **kwargs)
     engine, _ = fetch_artifact(engine)
+    if return_log_probs:
+        if not isinstance(engine, LogProbInferenceEngine):
+            raise NotImplementedError(
+                f"Error in infer: return_log_probs set to True but supplied engine "
+                f"{engine.__class__.__name__} does not support logprobs."
+            )
+        infer_outputs = engine.infer_log_probs(dataset, return_meta_data)
+        raw_predictions = (
+            [output.prediction for output in infer_outputs]
+            if return_meta_data
+            else infer_outputs
+        )
+        raw_predictions = [
+            json.dumps(raw_prediction) for raw_prediction in raw_predictions
+        ]
+    else:
+        infer_outputs = engine.infer(dataset, return_meta_data)
+        raw_predictions = (
+            [output.prediction for output in infer_outputs]
+            if return_meta_data
+            else infer_outputs
+        )
     predictions = post_process(raw_predictions, dataset)
     if return_data:
-        for prediction, raw_prediction, instance in zip(
-            predictions, raw_predictions, dataset
+        for prediction, raw_prediction, instance, infer_output in zip(
+            predictions, raw_predictions, dataset, infer_outputs
         ):
+            if return_meta_data:
+                instance["infer_meta_data"] = infer_output.__dict__
+                del instance["infer_meta_data"]["prediction"]
             instance["prediction"] = prediction
             instance["raw_prediction"] = raw_prediction
     return dataset
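A minimal usage sketch of the new api.py surface, reusing the illustrative catalog query from the docstring above; the instance list is a placeholder and MockInferenceEngine only stands in for a real engine:

from unitxt.api import load_dataset, infer
from unitxt.inference import MockInferenceEngine

# Illustrative dataset query, taken from the docstring example above.
query = "card=cards.wnli,template=templates.classification.multi_class.relation.default"

# streaming=True returns the recipe's Unitxt streams dictionary instead of a DatasetDict.
streams = load_dataset(query, streaming=True)

# return_log_probs would additionally require an engine implementing LogProbInferenceEngine;
# return_meta_data attaches an "infer_meta_data" dict to every returned instance.
dataset = infer(
    my_instances,  # placeholder: a list of raw task instances matching the card's fields
    engine=MockInferenceEngine(model_name="mock"),
    dataset_query=query,
    return_data=True,
)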
artifact.py
CHANGED
@@ -22,7 +22,12 @@ from .parsing_utils import (
 from .settings_utils import get_constants, get_settings
 from .text_utils import camel_to_snake_case, is_camel_case
 from .type_utils import issubtype
+from .utils import (
+    artifacts_json_cache,
+    json_dump,
+    save_to_file,
+    shallow_copy,
+)

 logger = get_logger()
 settings = get_settings()

@@ -405,7 +410,7 @@ def get_raw(obj):
     if isinstance(obj, dict):
         return type(obj)({get_raw(k): get_raw(v) for k, v in obj.items()})

+    return shallow_copy(obj)


 class ArtifactList(list, Artifact):
collections_operators.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Generator, List, Optional
 from .dict_utils import dict_get, dict_set
 from .operators import FieldOperator, StreamOperator
 from .stream import Stream
+from .utils import recursive_shallow_copy


 class Dictify(FieldOperator):

@@ -70,10 +70,10 @@ class DuplicateByList(StreamOperator):
         elements = dict_get(instance, self.field)
         for element in elements:
             if self.use_deep_copy:
+                instance_copy = recursive_shallow_copy(instance)

             else:
+                instance_copy = instance.copy()
             dict_set(instance_copy, to_field, element)
             yield instance_copy

@@ -93,7 +93,7 @@ class DuplicateBySubLists(StreamOperator):
         elements = instance[self.field]
         for i in range(1, len(elements) + 1):
             if self.use_deep_copy:
+                instance_copy = recursive_shallow_copy(instance)
                 instance_copy[to_field] = elements[:i]
             else:
                 instance_copy = {

@@ -107,3 +107,21 @@ class GetLength(FieldOperator):
 class GetLength(FieldOperator):
     def process_value(self, collection: Any) -> Any:
         return len(collection)
+
+
+class Filter(FieldOperator):
+    values: List[Any]
+
+    def process_value(self, collection: Any) -> Any:
+        # If collection is a list, tuple, or set
+        if isinstance(collection, (list, set, tuple)):
+            return type(collection)(
+                item for item in collection if item not in self.values
+            )
+
+        # If collection is a dictionary, filter by keys
+        if isinstance(collection, dict):
+            return {k: v for k, v in collection.items() if k not in self.values}
+
+        # If collection is of an unsupported type
+        raise TypeError(f"Unsupported collection type: {type(collection)}")
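A short sketch of how the new Filter operator behaves on raw collections; the field/to_field wiring follows the usual FieldOperator convention and the values shown are illustrative:

# Filter drops the listed values from lists/tuples/sets, and drops the listed keys from dicts.
f = Filter(field="labels", to_field="labels", values=["skip", "unknown"])

f.process_value(["a", "skip", "b"])    # -> ["a", "b"]   (collection type is preserved)
f.process_value(("skip", "c"))         # -> ("c",)
f.process_value({"a": 1, "skip": 2})   # -> {"a": 1}     (dicts are filtered by keys)
f.process_value(42)                    # raises TypeError: unsupported collection type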
dialog_operators.py
CHANGED
@@ -157,13 +157,13 @@ class SerializeOpenAiFormatDialog(SerializeDialog):
                 f"Entry {i} has a non-string 'content': {entry['content']}. The 'content' value must be a string."
             )

-        if entry["role"] not in {"user", "assistant"}:
+        if entry["role"].lower() not in {"user", "assistant"}:
             raise ValueError(
                 f"Entry {i} has an invalid role: {entry['role']}. Allowed roles are 'user' and 'assistant'."
             )

     first_entry = dialog[0]
-    if first_entry["role"] != "user":
+    if first_entry["role"].lower() != "user":
         raise ValueError(
             f"First entry role is expected to be 'user' It is {first_entry['role']}."
         )
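With the case-insensitive role check, dialogs whose roles are capitalized now pass validation; an illustrative example:

# Roles are compared after .lower(), so "User"/"Assistant" are accepted,
# while a genuinely unknown role such as "system" still raises ValueError.
dialog = [
    {"role": "User", "content": "What is the capital of France?"},
    {"role": "Assistant", "content": "Paris."},
]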
formats.py
CHANGED
@@ -182,6 +182,7 @@ class SystemFormat(BaseFormat):
                 target_prefix=demo_target_prefix,
                 source=demo_source,
                 target=demo_target,
+                instruction=instruction,
                 **self.format_args,
             )
             demos_string += demo_str
generator_utils.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List

 from .dataclass import Dataclass, OptionalField
+from .utils import recursive_shallow_copy


 class ReusableGenerator(Dataclass):

@@ -22,34 +22,4 @@ class ReusableGenerator(Dataclass):
 class CopyingReusableGenerator(ReusableGenerator):
     def __iter__(self):
         for instance in self.activate():
+            yield recursive_shallow_copy(instance)
-
-
-# if __name__ == "__main__":
-#     from itertools import chain, islice
-
-#     # Creating objects of MyIterable
-#     iterable1 = ReusableGenerator(range, gen_argv=[1, 4])
-#     iterable2 = ReusableGenerator(range, gen_argv=[4, 7])
-
-#     # Using itertools.chain
-#     chained = list(chain(iterable1, iterable2))
-#     logger.info(chained)  # Prints: [1, 2, 3, 4, 5, 6]
-
-#     # Using itertools.islice
-#     sliced = list(islice(ReusableGenerator(range, gen_argv=[1, 7]), 1, 4))
-#     logger.info(sliced)  # Prints: [2, 3, 4]
-
-#     # now same test with generators
-#     def generator(start, end):
-#         for i in range(start, end):
-#             yield i
-
-#     iterable1 = ReusableGenerator(generator, gen_argv=[1, 4])
-#     iterable2 = ReusableGenerator(generator, gen_argv=[4, 7])
-
-#     chained = list(chain(iterable1, iterable2))
-#     logger.info(chained)  # Prints: [1, 2, 3, 4, 5, 6]
-
-#     sliced = list(islice(ReusableGenerator(generator, gen_argv=[1, 7]), 1, 4))
-#     logger.info(sliced)  # Prints: [2, 3, 4]
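recursive_shallow_copy is imported from the updated utils.py, which is not shown in this view; a rough sketch of the semantics suggested by the name and the call sites above — containers copied level by level, leaf objects left shared — might look as follows (an assumption, not the actual utils.py code):

def recursive_shallow_copy_sketch(obj):
    # Copy dicts and lists recursively so the structure can be mutated safely,
    # while leaf values (strings, numbers, arbitrary objects) stay shared, unlike deepcopy.
    if isinstance(obj, dict):
        return {k: recursive_shallow_copy_sketch(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [recursive_shallow_copy_sketch(v) for v in obj]
    return obj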
inference.py
CHANGED
@@ -1,8 +1,10 @@
 import abc
+import dataclasses
 import os
 import re
 from typing import Any, Dict, List, Literal, Optional, Union

+from datasets import DatasetDict
 from tqdm import tqdm

 from .artifact import Artifact, fetch_artifact

@@ -16,12 +18,52 @@ from .settings_utils import get_settings
 settings = get_settings()


+def get_model_and_label_id(model_name, label):
+    model_id = model_name.split("/")[-1].replace("-", "_").replace(".", ",").lower()
+    return f"{model_id}_{label}"
+
+
+@dataclasses.dataclass
+class TextGenerationInferenceOutput:
+    """Contains the prediction results and metadata for the inference.
+
+    Args:
+        prediction (Union[str, List[Dict[str, Any]]]): If this is the result of an _infer call, the string predicted by the model.
+            If this is the results of an _infer_log_probs call, a list of dictionaries. The i'th dictionary represents
+            the i'th token in the response. The entry "top_tokens" in the dictionary holds a sorted list of the top tokens
+            for this position and their probabilities.
+            For example: [ {.. "top_tokens": [ {"text": "a", 'logprob': }, {"text": "b", 'logprob': } ....]},
+                          {.. "top_tokens": [ {"text": "c", 'logprob': }, {"text": "d", 'logprob': } ....]} ]
+
+        input_tokens (int): number of input tokens to the model.
+        output_tokens (int): number of output tokens to the model.
+        model_name (str): the model_name as kept in the InferenceEngine.
+        inference_type (str): The label stating the type of the InferenceEngine.
+    """
+
+    prediction: Union[str, List[Dict[str, Any]]]
+    input_tokens: Optional[int] = None
+    output_tokens: Optional[int] = None
+    model_name: Optional[str] = None
+    inference_type: Optional[str] = None
+
+
 class InferenceEngine(abc.ABC, Artifact):
     """Abstract base class for inference."""

     @abc.abstractmethod
+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+        """Perform inference on the input dataset.
+
+        If return_meta_data - returns a list of TextGenerationInferenceOutput, else returns a list of the string
+        predictions.
+        return_meta_data is only supported for some InferenceEngines.
+        """
         pass

     @abc.abstractmethod

@@ -33,12 +75,29 @@ class InferenceEngine(abc.ABC, Artifact):
         if not settings.mock_inference_mode:
             self.prepare_engine()

+    def infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+        """Verifies instances of a dataset and perform inference on the input dataset.
+
+        If return_meta_data - returns a list of TextGenerationInferenceOutput, else returns a list of the string
+        predictions.
+        """
+        if return_meta_data and not hasattr(self, "get_return_object"):
+            raise NotImplementedError(
+                f"Inference engine {self.__class__.__name__} does not support return_meta_data as it "
+                f"does not contain a 'get_return_object' method. Please set return_meta_data=False."
+            )
+
         [self.verify_instance(instance) for instance in dataset]
         if settings.mock_inference_mode:
             return [instance["source"] for instance in dataset]
-        return self._infer(dataset)
+        return self._infer(dataset, return_meta_data)
+
+    def get_engine_id(self):
+        raise NotImplementedError()

     @deprecation(version="2.0.0")
     def _set_inference_parameters(self):

@@ -62,19 +121,39 @@ class LogProbInferenceEngine(abc.ABC, Artifact):
     """Abstract base class for inference with log probs."""

     @abc.abstractmethod
+    def _infer_log_probs(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
+        """Perform inference on the input dataset that returns log probs.
+
+        If return_meta_data - returns a list of TextGenerationInferenceOutput, else returns a list of the logprob dicts.
+        return_meta_data is only supported for some InferenceEngines.
+        """
         pass

+    def infer_log_probs(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
         """Verifies instances of a dataset and performs inference that returns log probabilities of top tokens.

-        For each instance ,
+        For each instance , generates a list of top tokens per position.
         [ "top_tokens": [ { "text": ..., "logprob": ...} , ... ]
+        If return_meta_data - returns a list of TextGenerationInferenceOutput, else returns the list of the logprob dicts.
+        return_meta_data is only supported for some InferenceEngines.
         """
+        if return_meta_data and not hasattr(self, "get_return_object"):
+            raise NotImplementedError(
+                f"Inference engine {self.__class__.__name__} does not support return_meta_data as it "
+                f"does not contain a 'get_return_object' method. Please set return_meta_data=False."
+            )
+
         [self.verify_instance(instance) for instance in dataset]
-        return self._infer_log_probs(dataset)
+        return self._infer_log_probs(dataset, return_meta_data)


 class LazyLoadMixin(Artifact):

@@ -96,6 +175,9 @@ class HFPipelineBasedInferenceEngine(
         "transformers": "Install huggingface package using 'pip install --upgrade transformers"
     }

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, "hf_pipeline")
+
     def _prepare_pipeline(self):
         import torch
         from transformers import AutoConfig, pipeline

@@ -143,7 +225,11 @@ class HFPipelineBasedInferenceEngine(
     def _is_loaded(self):
         return hasattr(self, "model") and self.model is not None

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         if not self._is_loaded():
             self._prepare_pipeline()

@@ -157,12 +243,20 @@ class HFPipelineBasedInferenceEngine(

 class MockInferenceEngine(InferenceEngine):
     model_name: str
+    default_inference_value: str = "[[10]]"
+
+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, "mock")

     def prepare_engine(self):
         return

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+        return [self.default_inference_value for instance in dataset]


 class MockModeMixin(Artifact):

@@ -226,7 +320,14 @@ class GenericInferenceEngine(InferenceEngine):
             engine_reference = self.default
         self.engine, _ = fetch_artifact(engine_reference)

+    def get_engine_id(self):
+        return "generic_inference_engine"
+
+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         return self.engine._infer(dataset)

@@ -238,10 +339,17 @@ class OllamaInferenceEngine(InferenceEngine, PackageRequirementsMixin):
     }
     data_classification_policy = ["public", "proprietary"]

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, self.label)
+
     def prepare_engine(self):
         pass

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         import ollama

         result = [

@@ -260,7 +368,10 @@ class OllamaInferenceEngine(InferenceEngine, PackageRequirementsMixin):


 class IbmGenAiInferenceEngine(
+    InferenceEngine,
+    IbmGenAiInferenceEngineParamsMixin,
+    PackageRequirementsMixin,
+    LogProbInferenceEngine,
 ):
     label: str = "ibm_genai"
     model_name: str

@@ -270,6 +381,9 @@ class IbmGenAiInferenceEngine(
     data_classification_policy = ["public", "proprietary"]
     parameters: Optional[IbmGenAiInferenceEngineParams] = None

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, self.label)
+
     def prepare_engine(self):
         from genai import Client, Credentials

@@ -285,21 +399,88 @@ class IbmGenAiInferenceEngine(

         self._set_inference_parameters()

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         from genai.schema import TextGenerationParameters

         genai_params = TextGenerationParameters(
             **self.to_dict([IbmGenAiInferenceEngineParamsMixin])
         )

+        results = []
+        responses = self.client.text.generation.create(
+            model_id=self.model_name,
+            inputs=[instance["source"] for instance in dataset],
+            parameters=genai_params,
+        )
+        for response in responses:
+            generated_text = response.results[0].generated_text
+            result = self.get_return_object(
+                generated_text, response.results[0], return_meta_data
             )
+            results.append(result)
+        return results
+
+    def _infer_log_probs(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
+        from genai.schema import TextGenerationParameters
+
+        logprobs_return_options = {
+            "generated_tokens": True,
+            "input_text": False,
+            "input_tokens": False,
+            "token_logprobs": True,
+            "token_ranks": True,
+            "top_n_tokens": 5,
+        }
+        genai_params = self.to_dict(
+            [IbmGenAiInferenceEngineParamsMixin], keep_empty=False
+        )
+        genai_params = {**genai_params, "return_options": logprobs_return_options}
+        genai_params = TextGenerationParameters(**genai_params)
+        predictions = self.client.text.generation.create(
+            model_id=self.model_name,
+            inputs=[instance["source"] for instance in dataset],
+            parameters=genai_params,
+        )
+
+        predict_results = []
+        for prediction in predictions:
+            result = prediction.results[0]
+            assert isinstance(
+                result.generated_tokens, list
+            ), "result.generated_tokens should be a list"
+
+            predict_result = []
+            for base_token in result.generated_tokens:
+                res = {**base_token.__dict__, **base_token.model_extra}
+                res["top_tokens"] = [
+                    {"logprob": top_token.logprob, "text": top_token.text}
+                    for top_token in res["top_tokens"]
+                ]
+                predict_result.append(res)
+            final_results = self.get_return_object(
+                predict_result, result, return_meta_data
+            )
+            predict_results.append(final_results)
+        return predict_results
+
+    def get_return_object(self, predict_result, result, return_meta_data):
+        if return_meta_data:
+            return TextGenerationInferenceOutput(
+                prediction=predict_result,
+                input_tokens=result.input_token_count,
+                output_tokens=result.generated_token_count,
+                model_name=self.model_name,
+                inference_type=self.label,
+            )
+        return predict_result


 class OpenAiInferenceEngineParamsMixin(Artifact):

@@ -349,18 +530,29 @@ class OpenAiInferenceEngine(
     data_classification_policy = ["public"]
     parameters: Optional[OpenAiInferenceEngineParams] = None

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, self.label)

+    @classmethod
+    def get_api_param(cls, inference_engine: str, api_param_env_var_name: str):
+        api_key = os.environ.get(api_param_env_var_name)
         assert api_key is not None, (
+            f"Error while trying to run {inference_engine}."
+            f" Please set the environment param '{api_param_env_var_name}'."
         )
+        return api_key

+    def create_client(self):
+        from openai import OpenAI

+        api_key = self.get_api_param(
+            inference_engine="OpenAiInferenceEngine",
+            api_param_env_var_name="OPENAI_API_KEY",
+        )
+        return OpenAI(api_key=api_key)
+
+    def prepare_engine(self):
+        self.client = self.create_client()
         self._set_inference_parameters()

     def _get_completion_kwargs(self):

@@ -370,7 +562,11 @@ class OpenAiInferenceEngine(
             if v is not None
         }

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         outputs = []
         for instance in tqdm(dataset, desc="Inferring with openAI API"):
             response = self.client.chat.completions.create(

@@ -387,13 +583,18 @@ class OpenAiInferenceEngine(
                 model=self.model_name,
                 **self._get_completion_kwargs(),
             )
+            prediction = response.choices[0].message.content
+            output = self.get_return_object(prediction, response, return_meta_data)

             outputs.append(output)

         return outputs

+    def _infer_log_probs(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
         outputs = []
         for instance in tqdm(dataset, desc="Inferring with openAI API"):
             response = self.client.chat.completions.create(

@@ -411,7 +612,7 @@ class OpenAiInferenceEngine(
                 **self._get_completion_kwargs(),
             )
             top_logprobs_response = response.choices[0].logprobs.content
+            pred_output = [
                 {
                     "top_tokens": [
                         {"text": obj.token, "logprob": obj.logprob}

@@ -420,9 +621,21 @@ class OpenAiInferenceEngine(
                 }
                 for generated_token in top_logprobs_response
             ]
+            output = self.get_return_object(pred_output, response, return_meta_data)
             outputs.append(output)
         return outputs

+    def get_return_object(self, predict_result, response, return_meta_data):
+        if return_meta_data:
+            return TextGenerationInferenceOutput(
+                prediction=predict_result,
+                input_tokens=response.usage.prompt_tokens,
+                output_tokens=response.usage.completion_tokens,
+                model_name=self.model_name,
+                inference_type=self.label,
+            )
+        return predict_result
+

 class TogetherAiInferenceEngineParamsMixin(Artifact):
     max_tokens: Optional[int] = None

@@ -450,6 +663,9 @@ class TogetherAiInferenceEngine(
     data_classification_policy = ["public"]
     parameters: Optional[TogetherAiInferenceEngineParamsMixin] = None

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, self.label)
+
     def prepare_engine(self):
         from together import Together
         from together.types.models import ModelType

@@ -501,7 +717,11 @@ class TogetherAiInferenceEngine(
         )
         return response.choices[0].text

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         from together.types.models import ModelType

         outputs = []

@@ -514,6 +734,23 @@ class TogetherAiInferenceEngine(
         return outputs


+class VLLMRemoteInferenceEngine(OpenAiInferenceEngine):
+    label: str = "vllm"
+
+    def create_client(self):
+        from openai import OpenAI
+
+        api_key = self.get_api_param(
+            inference_engine="VLLMRemoteInferenceEngine",
+            api_param_env_var_name="VLLM_API_KEY",
+        )
+        api_url = self.get_api_param(
+            inference_engine="VLLMRemoteInferenceEngine",
+            api_param_env_var_name="VLLM_API_URL",
+        )
+        return OpenAI(api_key=api_key, base_url=api_url)
+
+
 class WMLInferenceEngineParamsMixin(Artifact):
     decoding_method: Optional[Literal["greedy", "sample"]] = None
     length_penalty: Optional[Dict[str, Union[int, float]]] = None

@@ -550,7 +787,10 @@ class WMLInferenceEngineParams(Artifact):


 class WMLInferenceEngine(
+    InferenceEngine,
+    WMLInferenceEngineParamsMixin,
+    PackageRequirementsMixin,
+    LogProbInferenceEngine,
 ):
     """Runs inference using ibm-watsonx-ai.

@@ -604,14 +844,17 @@ class WMLInferenceEngine(
     concurrency_limit: int = 10
     _client: Any = InternalField(default=None, name="WML client")

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, self.label)
+
     def verify(self):
         super().verify()

         if self.credentials is not None:
             for key in self.credentials:
-                if key not in ["url", "apikey", "project_id"]:
+                if key not in ["url", "apikey", "project_id", "space_id"]:
                     raise ValueError(
-                        f'Illegal credential key: {key}, use only ["url", "apikey", "project_id"]'
+                        f'Illegal credential key: {key}, use only ["url", "apikey", "project_id", "space_id"]'
                     )

         assert (

@@ -631,10 +874,14 @@ class WMLInferenceEngine(

     @staticmethod
     def _read_wml_credentials_from_env() -> (
-        Dict[Literal["url", "apikey", "project_id"], str]
+        Dict[Literal["url", "apikey", "project_id", "space_id"], str]
     ):
         credentials = {}
+        project_or_deployment_var_name = (
+            "WML_SPACE_ID" if "WML_SPACE_ID" in os.environ else "WML_PROJECT_ID"
+        )
+
+        for env_var_name in ["WML_URL", project_or_deployment_var_name, "WML_APIKEY"]:
             env_var = os.environ.get(env_var_name)
             assert env_var, (
                 f"Error while trying to run 'WMLInferenceEngine'. "

@@ -655,7 +902,10 @@ class WMLInferenceEngine(
             self.credentials = self._read_wml_credentials_from_env()

         client = APIClient(credentials=self.credentials)
+        if "space_id" in self.credentials:
+            client.set.default_space(self.credentials["space_id"])
+        else:
+            client.set.default_project(self.credentials["project_id"])
         return client

     def prepare_engine(self):

@@ -663,7 +913,7 @@ class WMLInferenceEngine(

         self._set_inference_parameters()

+    def _load_model_and_params(self):
         from ibm_watsonx_ai.foundation_models import ModelInference

         model = ModelInference(

@@ -671,20 +921,81 @@ class WMLInferenceEngine(
             deployment_id=self.deployment_id,
             api_client=self._client,
         )
+        params = self.to_dict([WMLInferenceEngineParamsMixin], keep_empty=False)

-        dataset = dataset if isinstance(dataset, list) else [dataset]
+        return model, params
+
+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+        model, params = self._load_model_and_params()
+
+        result = []
+        for instance in dataset:
+            instance_result = model.generate(
                 prompt=instance["source"],
                 params=self.to_dict([WMLInferenceEngineParamsMixin], keep_empty=False),
             )
+            prediction = instance_result["results"][0]["generated_text"]
+            instance_final_results = self.get_return_object(
+                prediction, instance_result, return_meta_data
+            )
+            result.append(instance_final_results)
+
+        return result
+
+    def _infer_log_probs(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]:
+        model, params = self._load_model_and_params()
+
+        user_return_options = params.pop("return_options", {})
+        # currently this is the only configuration that returns generated logprobs and behaves as expected
+        logprobs_return_options = {
+            "input_tokens": True,
+            "generated_tokens": True,
+            "token_logprobs": True,
+            "top_n_tokens": user_return_options.get("top_n_tokens", 5),
+        }
+        for key, value in logprobs_return_options.items():
+            if key in user_return_options and user_return_options[key] != value:
+                raise ValueError(
+                    f"'{key}={user_return_options[key]}' is not supported for the 'infer_log_probs' "
+                    f"method of {self.__class__.__name__}. For obtaining the logprobs of generated tokens "
+                    f"please use '{key}={value}'."
+                )
+
+        params = {
+            **params,
+            "return_options": logprobs_return_options,
+        }

+        results = model.generate(
+            prompt=[instance["source"] for instance in dataset],
+            params=params,
+        )
+        final_results = []
+        for result in results:
+            generated_tokens = result["results"][0]["generated_tokens"]
+            final_results.append(
+                self.get_return_object(generated_tokens, result, return_meta_data)
+            )
+        return final_results
+
+    def get_return_object(self, predict_result, result, return_meta_data):
+        if return_meta_data:
+            return TextGenerationInferenceOutput(
+                prediction=predict_result,
+                input_tokens=result["results"][0]["input_token_count"],
+                output_tokens=result["results"][0]["generated_token_count"],
+                model_name=self.model_name,
+                inference_type=self.label,
+            )
+        return predict_result


 class HFLlavaInferenceEngine(InferenceEngine, LazyLoadMixin):

@@ -698,6 +1009,9 @@ class HFLlavaInferenceEngine(InferenceEngine, LazyLoadMixin):
         "accelerate": "pip install accelerate",
     }

+    def get_engine_id(self):
+        return get_model_and_label_id(self.model_name, "hf_lava")
+
     def _prepare_engine(self):
         import torch
         from transformers import AutoProcessor, LlavaForConditionalGeneration

@@ -725,14 +1039,18 @@ class HFLlavaInferenceEngine(InferenceEngine, LazyLoadMixin):
     def _is_loaded(self):
         return hasattr(self, "model") and self.model is not None

+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], DatasetDict],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         if not self._is_loaded():
             self._prepare_engine()

         import torch

         results = []
-        for instance in dataset:
+        for instance in tqdm(dataset):
             text = instance["source"]
             images = extract_images(text, instance)
             # Regular expression to match all <img src="..."> tags

@@ -745,7 +1063,10 @@ class HFLlavaInferenceEngine(InferenceEngine, LazyLoadMixin):
         ).to(self.device, torch.float16)
         input_len = len(inputs["input_ids"][0])
         output = self.model.generate(
             **inputs,
+            max_new_tokens=self.max_new_tokens,
+            do_sample=False,
+            pad_token_id=self.processor.tokenizer.eos_token_id,
         )
         result = self.processor.decode(
             output[0][input_len:], skip_special_tokens=True
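A sketch of how the reworked engines are meant to be driven after this change; the engine choice and model name are illustrative, OPENAI_API_KEY is read from the environment as shown in the diff, and dataset is assumed to be a list of instances produced by a Unitxt recipe:

from unitxt.inference import OpenAiInferenceEngine, TextGenerationInferenceOutput

engine = OpenAiInferenceEngine(model_name="gpt-4o-mini")     # illustrative model name

texts = engine.infer(dataset)                                # List[str]
detailed = engine.infer(dataset, return_meta_data=True)      # List[TextGenerationInferenceOutput]
logprobs = engine.infer_log_probs(dataset)                   # per-token dicts with a "top_tokens" list

print(detailed[0].input_tokens, detailed[0].output_tokens, detailed[0].inference_type)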
llm_as_judge.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
|
|
| 1 |
from typing import Any, Dict, List, Literal, Optional
|
| 2 |
|
| 3 |
from .api import infer
|
| 4 |
from .artifact import fetch_artifact
|
| 5 |
from .dataclass import Field
|
| 6 |
from .formats import Format, SystemFormat
|
| 7 |
-
from .inference import InferenceEngine, OpenAiInferenceEngine
|
| 8 |
from .metrics import BulkInstanceMetric
|
| 9 |
from .operator import SequentialOperator
|
| 10 |
from .settings_utils import get_settings
|
|
@@ -14,38 +15,142 @@ from .templates import Template
|
|
| 14 |
settings = get_settings()
|
| 15 |
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
Attributes:
|
| 21 |
main_score (str): The main score label used for evaluation.
|
| 22 |
-
task (
|
| 23 |
format of the judge model.
|
| 24 |
template (Template): The template used when generating inputs for the judge llm.
|
| 25 |
format (Format): The format used when generating inputs for judge llm.
|
| 26 |
system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm.
|
| 27 |
-
strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting from the
|
| 28 |
-
inputs that the models that is being judges received, when they are inserted to the llm-as-judge prompt.
|
| 29 |
inference_model (InferenceEngine): The module that creates the inference of the judge llm.
|
| 30 |
reduction_map (dict): A dictionary specifying the reduction method for the metric.
|
| 31 |
batch_size (int): The size of the bulk.
|
| 32 |
"""
|
| 33 |
|
| 34 |
main_score: str = "llm_as_judge"
|
| 35 |
-
task:
|
| 36 |
-
"rating.single_turn",
|
| 37 |
-
"rating.single_turn_with_reference",
|
| 38 |
-
"pairwise_comparative_rating.single_turn",
|
| 39 |
-
]
|
| 40 |
template: Template
|
| 41 |
system_prompt: SystemPrompt = Field(default_factory=EmptySystemPrompt)
|
| 42 |
format: Format = Field(default_factory=SystemFormat)
|
| 43 |
-
strip_system_prompt_and_format_from_inputs: bool = True
|
| 44 |
inference_model: InferenceEngine
|
| 45 |
reduction_map: Optional[Dict[str, List[str]]] = None
|
| 46 |
batch_size: int = 32
|
| 47 |
prediction_type = Any # Because handled with multiple tasks
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |

    def _get_input_instances(self, task_data: List[Dict]) -> List:
        if self.strip_system_prompt_and_format_from_inputs:
            instances = []

@@ -119,6 +224,7 @@ class LLMAsJudge(BulkInstanceMetric):
        self.reduction_map = {"mean": [self.main_score]}

    def verify(self):
        supported_tasks = [
            "rating.single_turn",
            "rating.single_turn_with_reference",

@@ -129,68 +235,25 @@ class LLMAsJudge(BulkInstanceMetric):
                f"The supported tasks types are: {', '.join(supported_tasks)}."
            )

-        if not isinstance(self.template, Template):
-            raise ValueError(
-                f"Provided template argument to 'LLMAsJudge' metric is not of type Template, but {type(self.template)}"
-            )
-        if self.format and not isinstance(self.format, Format):
-            raise ValueError(
-                f"Provided format argument to 'LLMAsJudge' metric is not of type Format, but {type(self.format)}"
-            )
-
-        if self.system_prompt and not isinstance(self.system_prompt, SystemPrompt):
-            raise ValueError(
-                f"Provided system_prompt argument to 'LLMAsJudge' metric is not of type SystemPrompt, but {type(self.system_prompt)}"
-            )
-
-        if isinstance(self.inference_model, OpenAiInferenceEngine):
-            if self.format and type(self.format) is not SystemFormat:
-                raise ValueError(
-                    "Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
-                    "not support formatting. Please remove the format definition from the recipe"
-                    " (OpenAi Chat API take care of the formatting automatically)."
-                )
-            if self.system_prompt and type(self.system_prompt) is not EmptySystemPrompt:
-                raise ValueError(
-                    "Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
-                    "not support system prompt. Please remove the system_prompt definition from the recipe"
-                    " (Current implementation of Unitxt does not support this."
-                    " Support will be added in future updates)."
-                )

-    def compute(
-        self,
-        references: List[List[Any]],
-        predictions: List[Any],
-        task_data: List[Dict],
-    ) -> List[Dict[str, Any]]:
-        input_instances = self._get_input_instances(task_data)
-        instances = self._get_instance_for_judge_model(
-            input_instances, predictions, references
-        )
-        outputs = infer(
            instances,
            engine=self.inference_model,
-            task=
            template=self.template,
            system_prompt=self.system_prompt,
            format=self.format,
            return_data=True,
        )

        results = []
        for instance in outputs:
            if self.task == "pairwise_comparative_rating.single_turn":
-                # seems like the task data sometimes comes as a string, not a dict
-                # this fixes it
-                task_data = (
-                    json.loads(instance["task_data"])
-                    if isinstance(instance["task_data"], str)
-                    else instance["task_data"]
-                )
                is_model_b_the_baseline = task_data["model_b"] == "baseline_model"
                if is_model_b_the_baseline:
                    model_a_preference_score = instance["prediction"]

@@ -209,5 +272,141 @@ class LLMAsJudge(BulkInstanceMetric):
                "judge_raw_input": instance["source"],
            }
            results.append(result)

        return results
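
The hunks above strip LLMAsJudge of its inline validation and its monolithic compute(); as the new listing below shows, that logic moves to a new LLMAsJudgeBase whose compute() simply chains three hooks: prepare_instances, infer_instances and get_metric_results_from_prediction_outputs. A toy sketch of that template-method flow follows; the class name, task name and fixed rating are invented, and a real subclass would call infer() with a template and an InferenceEngine.

    # Hypothetical subclass; assumes LLMAsJudgeBase is importable from this module.
    from unitxt.llm_as_judge import LLMAsJudgeBase

    class ToyJudge(LLMAsJudgeBase):
        def get_full_task_name(self):
            return "tasks.toy_judging_task"  # placeholder task name

        def prepare_instances(self, references, predictions, task_data):
            # one judge input per prediction
            return [{"answer": str(p), **td} for p, td in zip(predictions, task_data)]

        def infer_instances(self, instances):
            # stand-in for infer(...): pretend the judge always answers 1.0
            return [{**instance, "prediction": 1.0} for instance in instances]

        def get_metric_results_from_prediction_outputs(self, outputs):
            return [{self.main_score: output["prediction"]} for output in outputs]

compute() then runs prepare_instances, infer_instances and get_metric_results_from_prediction_outputs in that order, exactly as in the base class added by this commit.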
+from abc import abstractmethod
from typing import Any, Dict, List, Literal, Optional

from .api import infer
from .artifact import fetch_artifact
from .dataclass import Field
from .formats import Format, SystemFormat
+from .inference import InferenceEngine, LogProbInferenceEngine, OpenAiInferenceEngine
from .metrics import BulkInstanceMetric
from .operator import SequentialOperator
from .settings_utils import get_settings

settings = get_settings()


+def get_task_data_dict(task_data):
+    import json
+
+    # seems like the task data sometimes comes as a string, not a dict
+    # this fixes it
+    return json.loads(task_data) if isinstance(task_data, str) else task_data
+
+
+class LLMAsJudgeBase(BulkInstanceMetric):
+    """LLM-as-judge-base metric class for evaluating correctness of generated predictions.

    Attributes:
        main_score (str): The main score label used for evaluation.
+        task (str): The type of task the llm as judge runs. This defines the output and input
            format of the judge model.
        template (Template): The template used when generating inputs for the judge llm.
        format (Format): The format used when generating inputs for judge llm.
        system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm.
        inference_model (InferenceEngine): The module that creates the inference of the judge llm.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The size of the bulk.
    """

    main_score: str = "llm_as_judge"
+    task: str
    template: Template
    system_prompt: SystemPrompt = Field(default_factory=EmptySystemPrompt)
    format: Format = Field(default_factory=SystemFormat)
    inference_model: InferenceEngine
    reduction_map: Optional[Dict[str, List[str]]] = None
    batch_size: int = 32
    prediction_type = Any  # Because handled with multiple tasks

+    def verify(self):
+        if not isinstance(self.template, Template):
+            raise ValueError(
+                f"Provided template argument to 'LLMAsJudge' metric is not of type Template, but {type(self.template)}"
+            )
+        if self.format and not isinstance(self.format, Format):
+            raise ValueError(
+                f"Provided format argument to 'LLMAsJudge' metric is not of type Format, but {type(self.format)}"
+            )
+
+        if self.system_prompt and not isinstance(self.system_prompt, SystemPrompt):
+            raise ValueError(
+                f"Provided system_prompt argument to 'LLMAsJudge' metric is not of type SystemPrompt, but {type(self.system_prompt)}"
+            )
+
+        if isinstance(self.inference_model, OpenAiInferenceEngine):
+            if self.format and type(self.format) is not SystemFormat:
+                raise ValueError(
+                    "Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
+                    "not support formatting. Please remove the format definition from the recipe"
+                    " (OpenAi Chat API take care of the formatting automatically)."
+                )
+            if self.system_prompt and type(self.system_prompt) is not EmptySystemPrompt:
+                raise ValueError(
+                    "Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
+                    "not support system prompt. Please remove the system_prompt definition from the recipe"
+                    " (Current implementation of Unitxt does not support this."
+                    " Support will be added in future updates)."
+                )
+
+    @abstractmethod
+    def get_full_task_name(self):
+        pass
+
+    def compute(
+        self,
+        references: List[List[Any]],
+        predictions: List[Any],
+        task_data: List[Dict],
+    ) -> List[Dict[str, Any]]:
+        instances = self.prepare_instances(references, predictions, task_data)
+        outputs = self.infer_instances(instances)
+        return self.get_metric_results_from_prediction_outputs(outputs)
+
+    @abstractmethod
+    def prepare_instances(
+        self, references, predictions, task_data
+    ) -> List[Dict[str, Any]]:
+        """Generate a list of instances for inference.
+
+        Each generated instance should include all the fields required by the metrics' task and template, to
+        create the source prompt for the judge.
+        """
+        pass
+
+    @abstractmethod
+    def infer_instances(self, instances: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Generate the dataset and call the inference engine to generate the judges' predictions.
+
+        Return the list of the produced instances with their generated judge predictions.
+        """
+        pass
+
+    @abstractmethod
+    def get_metric_results_from_prediction_outputs(
+        self, outputs: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """Generate a scores' dictionary for each instance.
+
+        Return the list of scores dictionaries for the input instances.
+        """
+        pass
+
+
+class LLMAsJudge(LLMAsJudgeBase):
+    """LLM-as-judge-based metric class for evaluating correctness of generated predictions.
+
+    This class uses the source prompt given to the generator and the generator's predictions to evaluate
+    correctness using one of three supported tasks (rating.single_turn, rating.single_turn_with_reference,
+    pairwise_comparative_rating.single_turn).
+
+    Attributes:
+        main_score (str): The main score label used for evaluation.
+        task (Literal["rating.single_turn", "rating.single_turn_with_reference",
+            "pairwise_comparative_rating.single_turn"]): The type of task the llm as judge runs.
+            This defines the output and input format of the judge model.
+        template (Template): The template used when generating inputs for the judge llm.
+        format (Format): The format used when generating inputs for judge llm.
+        system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm.
+        strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting from the
+            inputs that the model being judged received, when they are inserted into the llm-as-judge prompt.
+        inference_model (InferenceEngine): The module that creates the inference of the judge llm.
+        reduction_map (dict): A dictionary specifying the reduction method for the metric.
+        batch_size (int): The size of the bulk.
+    """
+
+    task: Literal[
+        "rating.single_turn",
+        "rating.single_turn_with_reference",
+        "pairwise_comparative_rating.single_turn",
+    ]
+    strip_system_prompt_and_format_from_inputs: bool = True
+
    def _get_input_instances(self, task_data: List[Dict]) -> List:
        if self.strip_system_prompt_and_format_from_inputs:
            instances = []
        self.reduction_map = {"mean": [self.main_score]}

    def verify(self):
+        super().verify()
        supported_tasks = [
            "rating.single_turn",
            "rating.single_turn_with_reference",

            f"The supported tasks types are: {', '.join(supported_tasks)}."
        )

+    def get_full_task_name(self):
+        return f"tasks.response_assessment.{self.task}"
+
+    def infer_instances(self, instances):
+        return infer(
            instances,
            engine=self.inference_model,
+            task=self.get_full_task_name(),
            template=self.template,
            system_prompt=self.system_prompt,
            format=self.format,
            return_data=True,
        )

+    def get_metric_results_from_prediction_outputs(self, outputs):
        results = []
        for instance in outputs:
            if self.task == "pairwise_comparative_rating.single_turn":
+                task_data = get_task_data_dict(instance["task_data"])
                is_model_b_the_baseline = task_data["model_b"] == "baseline_model"
                if is_model_b_the_baseline:
                    model_a_preference_score = instance["prediction"]

                "judge_raw_input": instance["source"],
            }
            results.append(result)
+        return results
+
+    def prepare_instances(self, references, predictions, task_data):
+        input_instances = self._get_input_instances(task_data)
+        return self._get_instance_for_judge_model(
+            input_instances, predictions, references
+        )

+
+class TaskBasedLLMasJudge(LLMAsJudgeBase):
+    """LLM-as-judge-based metric class for evaluating correctness of generated predictions.
+
+    This class can use any task and matching template to evaluate the predictions. All
+    task/template fields are taken from the instance's task_data.
+    The instances sent to the judge can either be: 1. a unitxt dataset, in which case the predictions are
+    copied to a specified field of the task. 2. dictionaries with the fields required by the task and template.
+
+    Attributes:
+        main_score (str): The main score label used for evaluation.
+        task (str): The type of task the llm as judge runs.
+            This defines the output and input format of the judge model.
+        template (Template): The template used when generating inputs for the judge llm.
+        format (Format): The format used when generating inputs for judge llm.
+        system_prompt (SystemPrompt): The system prompt used when generating inputs for judge llm.
+        strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting from the
+            inputs that the model being judged received, when they are inserted into the llm-as-judge prompt.
+        inference_model (InferenceEngine): The module that creates the inference of the judge llm.
+        reduction_map (dict): A dictionary specifying the reduction method for the metric.
+        batch_size (int): The size of the bulk.
+        infer_log_probs (bool): whether to perform the inference using logprobs. If true, the template's
+            post-processing must support the logprobs output.
+        judge_to_generator_fields_mapping (Dict[str, str]): optional mapping between the names of the fields in the generator task and the
+            judge task. For example, if the generator task uses "reference_answers" and the judge task expects "ground_truth",
+            include {"ground_truth": "reference_answers"} in this dictionary.
+        prediction_field: if indicated, and a prediction exists, copy the prediction to this field name in task_data.
+        include_meta_data (bool): whether to include the inference per-instance metadata in the returned results.
+
+    """
+
+    infer_log_probs: bool = False
+    judge_to_generator_fields_mapping: Dict[str, str] = {}
+    prediction_field: Optional[str] = None
+    include_meta_data: bool = True
+
+    # Allow for input which is a dictionary of all input fields. In this case, all input fields are
+    # treated as the task data, and the predictions and references are taken directly from there
+    # by the judge's template
+    def preprocess_instance(self, instance):
+        if "task_data" not in instance:
+            instance["task_data"] = instance.copy()
+        if "prediction" not in instance:
+            instance["prediction"] = None
+        if "references" not in instance:
+            instance["references"] = [""]
+        return instance
+
+    def verify(self):
+        super().verify()
+        if self.infer_log_probs and not isinstance(
+            self.inference_model, LogProbInferenceEngine
+        ):
+            raise NotImplementedError(
+                f"Error in TaskBasedLLMasJudge: return_log_probs set to True but supplied engine "
+                f"{self.inference_model.__class__.__name__} does not support logprobs."
+            )
+        if self.include_meta_data and not hasattr(
+            self.inference_model, "get_return_object"
+        ):
+            Warning(
+                f"Supplied inference engine {self.inference_model.__class__.__name__} does not support "
+                "return_meta_data. Setting return_meta_data to False. Metadata scores will not appear "
+                "in returned instances scores."
+            )
+            self.include_meta_data = False
+
+    def prepare(self):
+        super().prepare()
+        self.reduction_map = {"mean": [self.main_score]}
+        self.score_prefix = f"{self.inference_model.get_engine_id()}_"
+
+    def get_full_task_name(self):
+        return self.task
+
+    def get_metric_results_from_prediction_outputs(self, outputs):
+        results = []
+        for instance in outputs:
+            result = {
+                self.main_score: instance["prediction"],
+                f"{self.main_score}_judge_raw_output": instance["raw_prediction"],
+                f"{self.main_score}_judge_raw_input": instance["source"],
+            }
+            if self.include_meta_data:
+                meta_data = {
+                    f"{self.main_score}_{k}": v
+                    for k, v in instance["infer_meta_data"].items()
+                }
+                result.update(meta_data)
+            results.append(result)
        return results
+
+    def prepare_instances(self, references, predictions, task_data):
+        from . import get_from_catalog
+
+        instances = []
+        judge_task = get_from_catalog(self.get_full_task_name())
+        judge_task_input_fields = judge_task.input_fields
+
+        for input_instance, prediction, _ in zip(task_data, predictions, references):
+            input_instance = get_task_data_dict(input_instance)
+
+            instance_task_data = {}
+            for judge_task_input_field in judge_task_input_fields:
+                orig_task_field_name = self.judge_to_generator_fields_mapping.get(
+                    judge_task_input_field, judge_task_input_field
+                )
+                new_val = input_instance.get(orig_task_field_name)
+                if new_val:
+                    instance_task_data[judge_task_input_field] = new_val
+
+            if self.prediction_field and prediction:
+                instance_task_data[self.prediction_field] = str(prediction)
+            instance_task_data = judge_task.process(instance_task_data)["input_fields"]
+            instances.append(instance_task_data)
+
+        return instances
+
+    def infer_instances(self, instances):
+        return infer(
+            instances,
+            engine=self.inference_model,
+            task=self.get_full_task_name(),
+            template=self.template,
+            system_prompt=self.system_prompt,
+            format=self.format,
+            return_data=True,
+            return_log_probs=self.infer_log_probs,
+            return_meta_data=self.include_meta_data,
+        )
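
To make the judge_to_generator_fields_mapping behaviour of TaskBasedLLMasJudge concrete, here is a standalone sketch of the renaming step performed in prepare_instances; the field names and values are examples only, not catalog definitions.

    judge_to_generator_fields_mapping = {"ground_truth": "reference_answers"}
    judge_task_input_fields = ["question", "ground_truth", "answer"]

    generator_instance = {
        "question": "What is the capital of France?",
        "reference_answers": "Paris",
    }
    prediction = "Paris is the capital of France."

    # For every field the judge task expects, look it up under its (possibly
    # different) name in the generator's task data.
    instance_task_data = {}
    for judge_field in judge_task_input_fields:
        source_field = judge_to_generator_fields_mapping.get(judge_field, judge_field)
        value = generator_instance.get(source_field)
        if value:
            instance_task_data[judge_field] = value

    # With prediction_field="answer", the model prediction is copied in as well.
    instance_task_data["answer"] = str(prediction)
    print(instance_task_data)
    # {'question': 'What is the capital of France?', 'ground_truth': 'Paris',
    #  'answer': 'Paris is the capital of France.'}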
loaders.py
CHANGED

@@ -53,7 +53,7 @@ from .operators import Set
from .settings_utils import get_settings
from .stream import DynamicStream, MultiStream
from .type_utils import isoftype
-from .utils import

logger = get_logger()
settings = get_settings()

@@ -195,6 +195,10 @@ class LoadHF(Loader):
    def stream_dataset(self):
        if self._cache is None:
            with tempfile.TemporaryDirectory() as dir_to_be_deleted:
                try:
                    dataset = hf_load_dataset(
                        self.path,

@@ -203,7 +207,7 @@ class LoadHF(Loader):
                        data_files=self.data_files,
                        revision=self.revision,
                        streaming=self.streaming,
-                        cache_dir=
                        split=self.split,
                        trust_remote_code=settings.allow_unverified_code,
                        num_proc=self.num_proc,

@@ -231,6 +235,10 @@ class LoadHF(Loader):
    def load_dataset(self):
        if self._cache is None:
            with tempfile.TemporaryDirectory() as dir_to_be_deleted:
                try:
                    dataset = hf_load_dataset(
                        self.path,

@@ -239,7 +247,7 @@ class LoadHF(Loader):
                        data_files=self.data_files,
                        streaming=False,
                        keep_in_memory=True,
-                        cache_dir=
                        split=self.split,
                        trust_remote_code=settings.allow_unverified_code,
                        num_proc=self.num_proc,

@@ -664,7 +672,7 @@ class MultipleSourceLoader(Loader):

    .. code-block:: python

-        MultipleSourceLoader(


@@ -672,7 +680,7 @@ class MultipleSourceLoader(Loader):

    .. code-block:: python

-        MultipleSourceLoader(
    """

    sources: List[Loader]

@@ -737,7 +745,7 @@ class LoadFromDictionary(Loader):
        self.sef_default_data_classification(
            ["proprietary"], "when loading from python dictionary"
        )
-        return MultiStream.from_iterables(


class LoadFromHFSpace(LoadHF):
from .settings_utils import get_settings
from .stream import DynamicStream, MultiStream
from .type_utils import isoftype
+from .utils import recursive_copy

logger = get_logger()
settings = get_settings()

    def stream_dataset(self):
        if self._cache is None:
            with tempfile.TemporaryDirectory() as dir_to_be_deleted:
+                if settings.disable_hf_datasets_cache and not self.streaming:
+                    cache_dir = dir_to_be_deleted
+                else:
+                    cache_dir = None
                try:
                    dataset = hf_load_dataset(
                        self.path,

                        data_files=self.data_files,
                        revision=self.revision,
                        streaming=self.streaming,
+                        cache_dir=cache_dir,
                        split=self.split,
                        trust_remote_code=settings.allow_unverified_code,
                        num_proc=self.num_proc,

    def load_dataset(self):
        if self._cache is None:
            with tempfile.TemporaryDirectory() as dir_to_be_deleted:
+                if settings.disable_hf_datasets_cache:
+                    cache_dir = dir_to_be_deleted
+                else:
+                    cache_dir = None
                try:
                    dataset = hf_load_dataset(
                        self.path,

                        data_files=self.data_files,
                        streaming=False,
                        keep_in_memory=True,
+                        cache_dir=cache_dir,
                        split=self.split,
                        trust_remote_code=settings.allow_unverified_code,
                        num_proc=self.num_proc,

    .. code-block:: python

+        MultipleSourceLoader(sources = [ LoadHF(path="public/data",split="train"), LoadCSV({"test": "mytest.csv"}) ])


    .. code-block:: python

+        MultipleSourceLoader(sources = [ LoadCSV({"test": "mytest1.csv"}, LoadCSV({"test": "mytest2.csv"}) ])
    """

    sources: List[Loader]

        self.sef_default_data_classification(
            ["proprietary"], "when loading from python dictionary"
        )
+        return MultiStream.from_iterables(recursive_copy(self.data))


class LoadFromHFSpace(LoadHF):
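
The cache_dir logic added to LoadHF above is easy to read in isolation: when the disable_hf_datasets_cache setting is on, the HuggingFace datasets cache is redirected into a temporary directory that disappears when loading finishes; when streaming, no cache directory is used at all. A standalone sketch of the same decision:

    import tempfile

    def resolve_cache_dir(disable_hf_datasets_cache: bool, streaming: bool, tmp_dir: str):
        # Mirrors the branch added to LoadHF.stream_dataset above.
        if disable_hf_datasets_cache and not streaming:
            return tmp_dir
        return None

    with tempfile.TemporaryDirectory() as dir_to_be_deleted:
        print(resolve_cache_dir(True, False, dir_to_be_deleted))   # the throwaway dir
        print(resolve_cache_dir(True, True, dir_to_be_deleted))    # None (streaming)
        print(resolve_cache_dir(False, False, dir_to_be_deleted))  # None (normal caching)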
metric_utils.py
CHANGED

@@ -16,8 +16,8 @@ from .operator import (
from .operators import (
    ApplyMetric,
    ApplyOperatorsField,
-    Copy,
    FlattenInstances,
    Rename,
)
from .register import _reset_env_local_catalogs, register_all_artifacts

@@ -25,7 +25,7 @@ from .schema import UNITXT_DATASET_SCHEMA
from .settings_utils import get_constants, get_settings
from .stream import DynamicStream, MultiStream
from .struct_data_operators import LoadJson
-from .utils import

constants = get_constants()

@@ -54,27 +54,27 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):

_post_process_steps = SequentialOperator(
    steps=[
-        Copy(
            field="prediction",
            to_field="raw_prediction",
        ),
-        Copy(
            field="references",
            to_field="raw_references",
            dont_apply_to_streams=[constants.inference_stream],
        ),
-        Copy(
            field="source",
            to_field="task_data/source",
        ),
        ApplyOperatorsField(
            operators_field="postprocessors",
        ),
-        Copy(
            field="prediction",
            to_field="processed_prediction",
        ),
-        Copy(
            field="references",
            to_field="processed_references",
            dont_apply_to_streams=[constants.inference_stream],

@@ -213,14 +213,19 @@ class JoinSubsetsAndGroups(MultiStreamOperator):

        result = {}
        all_scores = []
        for k, v in dic.items():
            score = recursive_mean(v)
            if score is not None:
                all_scores.append(score["score"])
                result[k] = score

        result["score"] = nan_mean(all_scores)
        result["score_name"] = "subsets_mean"

        if result:
            return result

@@ -237,11 +242,15 @@ class JoinSubsetsAndGroups(MultiStreamOperator):
            "score": score["subsets"]["score"],
            "score_name": score["subsets"]["score_name"],
        }

        sorted_instances = []
        for key in sorted(stream_instances.keys()):
            instance = stream_instances[key]
-            instance["score"].update(
            sorted_instances.append(instance)
        result[stream_name] = sorted_instances

@@ -299,7 +308,7 @@ class MetricRecipe(SequentialOperatorInitializer):
            field="raw_references",
            to_field="references",
        ),
-        Copy(
            field="source",
            to_field="task_data/source",
        ),
from .operators import (
    ApplyMetric,
    ApplyOperatorsField,
    FlattenInstances,
+    RecursiveCopy,
    Rename,
)
from .register import _reset_env_local_catalogs, register_all_artifacts

from .settings_utils import get_constants, get_settings
from .stream import DynamicStream, MultiStream
from .struct_data_operators import LoadJson
+from .utils import recursive_shallow_copy

constants = get_constants()

_post_process_steps = SequentialOperator(
    steps=[
+        RecursiveCopy(
            field="prediction",
            to_field="raw_prediction",
        ),
+        RecursiveCopy(
            field="references",
            to_field="raw_references",
            dont_apply_to_streams=[constants.inference_stream],
        ),
+        RecursiveCopy(
            field="source",
            to_field="task_data/source",
        ),
        ApplyOperatorsField(
            operators_field="postprocessors",
        ),
+        RecursiveCopy(
            field="prediction",
            to_field="processed_prediction",
        ),
+        RecursiveCopy(
            field="references",
            to_field="processed_references",
            dont_apply_to_streams=[constants.inference_stream],

        result = {}
        all_scores = []
+        all_num_of_instances = []
        for k, v in dic.items():
            score = recursive_mean(v)
            if score is not None:
                all_scores.append(score["score"])
+                if "num_of_instances" in score:
+                    all_num_of_instances.append(score["num_of_instances"])
                result[k] = score

        result["score"] = nan_mean(all_scores)
        result["score_name"] = "subsets_mean"
+        if all_num_of_instances:
+            result["num_of_instances"] = sum(all_num_of_instances)

        if result:
            return result

            "score": score["subsets"]["score"],
            "score_name": score["subsets"]["score_name"],
        }
+        if "num_of_instances" in score["subsets"]:
+            score["global"]["num_of_instances"] = score["subsets"][
+                "num_of_instances"
+            ]

        sorted_instances = []
        for key in sorted(stream_instances.keys()):
            instance = stream_instances[key]
+            instance["score"].update(recursive_shallow_copy(score))
            sorted_instances.append(instance)
        result[stream_name] = sorted_instances

            field="raw_references",
            to_field="references",
        ),
+        RecursiveCopy(
            field="source",
            to_field="task_data/source",
        ),
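
The subset aggregation above averages the per-subset scores and, when they carry one, sums their num_of_instances counts. A small self-contained illustration of that arithmetic (subset names and numbers are invented):

    from math import isnan

    def nan_mean(values):
        # average that skips NaNs, loosely mirroring unitxt's nan_mean helper
        clean = [v for v in values if not isnan(v)]
        return sum(clean) / len(clean) if clean else float("nan")

    subset_scores = {
        "qa": {"score": 0.80, "num_of_instances": 100},
        "summarization": {"score": 0.60, "num_of_instances": 50},
    }

    result = dict(subset_scores)
    result["score"] = nan_mean([v["score"] for v in subset_scores.values()])  # 0.7
    result["score_name"] = "subsets_mean"
    result["num_of_instances"] = sum(v["num_of_instances"] for v in subset_scores.values())  # 150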
metrics.py
CHANGED

@@ -8,10 +8,9 @@ import warnings
from abc import ABC, abstractmethod
from collections import Counter, defaultdict
from dataclasses import field
-from
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

-import evaluate
import numpy
import numpy as np
import pandas as pd

@@ -37,20 +36,18 @@ from .operator import (
    StreamingOperator,
    StreamOperator,
)
-from .operators import Copy
from .random_utils import get_seed
from .settings_utils import get_settings
from .stream import MultiStream, Stream
from .type_utils import Type, isoftype, parse_type_string, to_type_string
-from .utils import

logger = get_logger()
settings = get_settings()

warnings.filterwarnings("ignore", category=DegenerateDataWarning)

-warnings.filterwarnings("ignore", category=DegenerateDataWarning)
-

def abstract_factory():
    return {}
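
The module-level `import evaluate` removed here does not disappear: later hunks in this file re-introduce it locally inside the various prepare() methods, so the dependency is only loaded when a HuggingFace-backed metric is actually prepared. A minimal sketch of that deferred-import pattern, assuming the evaluate package is installed by the time prepare() is finally called:

    class HuggingfaceBackedMetric:
        """Sketch only: keep module import cheap, defer `evaluate` to prepare()."""

        def __init__(self, hf_metric_name: str):
            self.hf_metric_name = hf_metric_name
            self.metric = None

        def prepare(self):
            import evaluate  # imported lazily, as in the prepare() methods below

            self.metric = evaluate.load(self.hf_metric_name)

    m = HuggingfaceBackedMetric("accuracy")  # nothing heavy imported yet
    # m.prepare()  # would import evaluate and load the metric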
@@ -139,6 +136,7 @@ class Metric(Artifact):
|
|
| 139 |
return (
|
| 140 |
self.score_prefix + score_name
|
| 141 |
if score_name not in ["score", "score_name"]
|
|
|
|
| 142 |
else score_name
|
| 143 |
)
|
| 144 |
|
|
@@ -147,18 +145,24 @@ class Metric(Artifact):
|
|
| 147 |
) -> Dict[str, Any]:
|
| 148 |
new_scores = {}
|
| 149 |
for score_name, score in scores.items():
|
|
|
|
|
|
|
|
|
|
| 150 |
score_with_prefix = self._add_score_prefix(score_name)
|
| 151 |
new_scores[score_with_prefix] = (
|
| 152 |
score if score_name not in ["score_name"] else self.score_prefix + score
|
| 153 |
)
|
| 154 |
for new_score_name in new_scores:
|
| 155 |
-
if new_score_name in ["score", "score_name"]
|
|
|
|
|
|
|
| 156 |
continue
|
| 157 |
if new_score_name in existing_scores:
|
| 158 |
UnitxtWarning(
|
| 159 |
message=f"Metric '{new_score_name}' that has just been evaluated to {new_scores[new_score_name]}, is already recorded "
|
| 160 |
f"to have value {existing_scores[new_score_name]} by a previous metric evaluation on this instance or stream. "
|
| 161 |
-
f"To avoid overwriting the existing value, add a score_prefix to the metric (e.g. score_prefix='my_second_'
|
|
|
|
| 162 |
additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
|
| 163 |
)
|
| 164 |
return new_scores
|
|
@@ -279,7 +283,12 @@ class Metric(Artifact):
|
|
| 279 |
self, instance: Dict[str, Any], global_score: dict
|
| 280 |
):
|
| 281 |
for score_name in global_score:
|
| 282 |
-
if score_name in [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
continue
|
| 284 |
if score_name in instance["score"]["global"]:
|
| 285 |
UnitxtWarning(
|
|
@@ -469,11 +478,17 @@ class MetricWithConfidenceInterval(Metric):
|
|
| 469 |
# iterate over the rows and compute the metric on each resampling
|
| 470 |
def metric(sample_refs, sample_preds, sample_task_data):
|
| 471 |
try:
|
| 472 |
-
|
| 473 |
references=sample_refs,
|
| 474 |
predictions=sample_preds,
|
| 475 |
task_data=sample_task_data,
|
| 476 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 477 |
except Exception as e:
|
| 478 |
# this happens in edge cases, for example, when the sampling creates a
|
| 479 |
# sample where all strings are empty and this fails bleu.
|
|
@@ -538,7 +553,6 @@ class GlobalMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 538 |
references = []
|
| 539 |
predictions = []
|
| 540 |
task_data = []
|
| 541 |
-
global_score = {}
|
| 542 |
|
| 543 |
instances = []
|
| 544 |
|
|
@@ -589,6 +603,7 @@ class GlobalMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 589 |
)
|
| 590 |
)
|
| 591 |
self._validate_references_and_prediction(references, predictions)
|
|
|
|
| 592 |
|
| 593 |
result = self._compute(references, predictions, task_data)
|
| 594 |
global_score.update(
|
|
@@ -596,11 +611,18 @@ class GlobalMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 596 |
result, global_score
|
| 597 |
)
|
| 598 |
)
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
|
| 605 |
for instance in instances:
|
| 606 |
self.update_and_adjust_global_score(instance, global_score)
|
|
@@ -649,28 +671,24 @@ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 649 |
default_factory=lambda: ["mean", "weighted_win_rate"]
|
| 650 |
)
|
| 651 |
|
|
|
|
|
|
|
|
|
|
| 652 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 653 |
-
global_score = {}
|
| 654 |
instances = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
|
| 656 |
-
|
| 657 |
-
references
|
| 658 |
-
list,
|
| 659 |
-
zip(
|
| 660 |
-
*[
|
| 661 |
-
itemgetter("references", "prediction")(
|
| 662 |
-
self.verify_instance(instance)
|
| 663 |
-
)
|
| 664 |
-
for instance in stream
|
| 665 |
-
]
|
| 666 |
-
),
|
| 667 |
-
)
|
| 668 |
-
|
| 669 |
task_data = [
|
| 670 |
instance["task_data"] if "task_data" in instance else {}
|
| 671 |
-
for instance in
|
| 672 |
]
|
| 673 |
self._validate_references_and_prediction(references, predictions)
|
|
|
|
| 674 |
# compute the metric over all refs and preds
|
| 675 |
instance_scores = self.compute(
|
| 676 |
references=references,
|
|
@@ -683,7 +701,7 @@ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 683 |
instance_score["score"] = instance_score[self.main_score]
|
| 684 |
instance_score["score_name"] = self.main_score
|
| 685 |
|
| 686 |
-
for instance, score in zip(
|
| 687 |
if "score" not in instance:
|
| 688 |
instance["score"] = {"global": {}, "instance": {}}
|
| 689 |
|
|
@@ -692,7 +710,6 @@ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 692 |
score, instance["score"]["instance"]
|
| 693 |
)
|
| 694 |
)
|
| 695 |
-
instances.append(instance)
|
| 696 |
|
| 697 |
for reduction, fields in self.reduction_map.items():
|
| 698 |
assert (
|
|
@@ -1059,7 +1076,7 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1059 |
|
| 1060 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 1061 |
instances = self.compute_instance_scores(stream)
|
| 1062 |
-
global_score = {}
|
| 1063 |
for reduction_type, reduction_params in self.reduction_map.items():
|
| 1064 |
assert (
|
| 1065 |
reduction_type in self.implemented_reductions
|
|
@@ -1096,7 +1113,10 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1096 |
scores_to_resample,
|
| 1097 |
aggregation_function,
|
| 1098 |
) = self._set_up_group_mean_aggregation(
|
| 1099 |
-
instances,
|
|
|
|
|
|
|
|
|
|
| 1100 |
)
|
| 1101 |
else:
|
| 1102 |
raise ValueError(
|
|
@@ -1171,13 +1191,16 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1171 |
instance_score["score_name"] = self.main_score
|
| 1172 |
if "score" not in instance:
|
| 1173 |
instance["score"] = {"global": {}, "instance": {}}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1174 |
|
| 1175 |
instance["score"]["instance"].update(
|
| 1176 |
self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
|
| 1177 |
instance_score, instance["score"]["instance"]
|
| 1178 |
)
|
| 1179 |
)
|
| 1180 |
-
|
| 1181 |
instances.append(instance)
|
| 1182 |
|
| 1183 |
return instances
|
|
@@ -1187,7 +1210,9 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1187 |
instances: List[dict],
|
| 1188 |
score_names: List[str],
|
| 1189 |
group_aggregation_func,
|
| 1190 |
-
prepend_score_prefix: bool
|
|
|
|
|
|
|
| 1191 |
):
|
| 1192 |
"""Group scores by the group_id and subgroup_type fields of each instance, and compute group_aggregation_func by group.
|
| 1193 |
|
|
@@ -1199,6 +1224,8 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1199 |
callable function returns a single score for the group
|
| 1200 |
prepend_score_prefix: if True - prepend the score_prefix to the score names in the returned dicts. Set to False
|
| 1201 |
if down the stream such a prepending is expected.
|
|
|
|
|
|
|
| 1202 |
|
| 1203 |
Returns:
|
| 1204 |
List of dicts, each corresponding to a group of instances (defined by 'group_id'),
|
|
@@ -1233,8 +1260,27 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1233 |
]
|
| 1234 |
)
|
| 1235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
# if group_aggregation_func expects a subgroup-types score dict, pass it; otherwise pass the default type list of scores
|
| 1237 |
-
|
| 1238 |
{
|
| 1239 |
"score": {
|
| 1240 |
"instance": {
|
|
@@ -1255,9 +1301,25 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1255 |
) # sorted for consistency
|
| 1256 |
]
|
| 1257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1258 |
def _set_up_group_mean_aggregation(
|
| 1259 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1260 |
):
|
|
|
|
| 1261 |
group_aggregation_func = reduction_params["agg_func"][1]
|
| 1262 |
# if treat groups as units
|
| 1263 |
do_resample_as_group = reduction_params["agg_func"][2]
|
|
@@ -1265,7 +1327,12 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1265 |
# pass the group aggregate---not instance---scores to resample as usual
|
| 1266 |
aggregation_function = self.average_item_scores
|
| 1267 |
scores_to_resample = self.get_group_scores(
|
| 1268 |
-
instances,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1269 |
)
|
| 1270 |
else:
|
| 1271 |
# pass the instance scores to resample, and calculate the group aggregation on the resamplings
|
|
@@ -1277,7 +1344,12 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
|
|
| 1277 |
group_aggregation_func=group_aggregation_func,
|
| 1278 |
):
|
| 1279 |
group_scores = self.get_group_scores(
|
| 1280 |
-
instances,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1281 |
)
|
| 1282 |
return nan_mean(
|
| 1283 |
[group["score"]["instance"][field_name] for group in group_scores]
|
|
@@ -1315,6 +1387,19 @@ class ANLS(InstanceMetric):
|
|
| 1315 |
reduction_map = {"mean": ["anls"]}
|
| 1316 |
prediction_type = Any # string representation is compared
|
| 1317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1318 |
def compute(
|
| 1319 |
self,
|
| 1320 |
references: List[Any],
|
|
@@ -1324,20 +1409,14 @@ class ANLS(InstanceMetric):
|
|
| 1324 |
) -> dict:
|
| 1325 |
"""ANLS image-text accuracy metric."""
|
| 1326 |
values = []
|
| 1327 |
-
for
|
| 1328 |
-
|
| 1329 |
-
gt_answer = " ".join(answer.strip().lower().split())
|
| 1330 |
-
det_answer = " ".join(prediction.strip().lower().split())
|
| 1331 |
-
|
| 1332 |
-
# dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
|
| 1333 |
-
dist = self.levenshtein_distance(gt_answer, det_answer)
|
| 1334 |
-
length = max(len(answer.upper()), len(prediction.upper()))
|
| 1335 |
-
values.append(0.0 if length == 0 else float(dist) / float(length))
|
| 1336 |
|
| 1337 |
question_result = 1.0 - min(values)
|
| 1338 |
|
| 1339 |
if question_result < threshold:
|
| 1340 |
question_result = 0.0
|
|
|
|
| 1341 |
result = {}
|
| 1342 |
result["score"] = question_result
|
| 1343 |
result[self.main_score] = question_result
|
|
@@ -1345,6 +1424,7 @@ class ANLS(InstanceMetric):
|
|
| 1345 |
return result
|
| 1346 |
|
| 1347 |
@staticmethod
|
|
|
|
| 1348 |
def levenshtein_distance(s1, s2):
|
| 1349 |
if len(s1) > len(s2):
|
| 1350 |
s1, s2 = s2, s1
|
|
@@ -1526,16 +1606,40 @@ class MetricPipeline(MultiStreamOperator, Metric):
|
|
| 1526 |
), "Must define at most one of postpreprocess_steps (which is deprecated) and postprocess_steps (to be used from now on)"
|
| 1527 |
if has_postpreprocess:
|
| 1528 |
self.postprocess_steps = self.postpreprocess_steps
|
| 1529 |
-
self.prepare_score =
|
| 1530 |
-
|
| 1531 |
-
|
| 1532 |
-
f"score/instance/{self.metric._add_score_prefix(self.main_score)}",
|
| 1533 |
-
"score/instance/score",
|
| 1534 |
-
|
| 1535 |
-
|
| 1536 |
-
f"score/global/{self.metric._add_score_prefix(self.main_score)}",
|
| 1537 |
-
"score/global/score",
|
| 1538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1539 |
],
|
| 1540 |
)
|
| 1541 |
|
|
@@ -1589,6 +1693,8 @@ class HuggingfaceMetric(GlobalMetric):
|
|
| 1589 |
|
| 1590 |
def prepare(self):
|
| 1591 |
super().prepare()
|
|
|
|
|
|
|
| 1592 |
self.metric = evaluate.load(
|
| 1593 |
self.hf_metric_name, experiment_id=self.experiment_id
|
| 1594 |
)
|
|
@@ -1663,6 +1769,8 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
|
|
| 1663 |
|
| 1664 |
def prepare(self):
|
| 1665 |
super().prepare()
|
|
|
|
|
|
|
| 1666 |
self.metric = evaluate.load(
|
| 1667 |
self.hf_metric_name, experiment_id=str(uuid.uuid4())
|
| 1668 |
)
|
|
@@ -1709,6 +1817,8 @@ class HuggingfaceInstanceMetric(InstanceMetric):
|
|
| 1709 |
|
| 1710 |
def prepare(self):
|
| 1711 |
super().prepare()
|
|
|
|
|
|
|
| 1712 |
self.metric = evaluate.load(
|
| 1713 |
self.hf_metric_name, experiment_id=str(uuid.uuid4())
|
| 1714 |
)
|
|
@@ -1788,6 +1898,8 @@ class F1(GlobalMetric):
|
|
| 1788 |
|
| 1789 |
def prepare(self):
|
| 1790 |
super().prepare()
|
|
|
|
|
|
|
| 1791 |
self._metric = evaluate.load(self.metric, experiment_id=str(uuid.uuid4()))
|
| 1792 |
|
| 1793 |
def get_str_id(self, str):
|
|
@@ -1847,6 +1959,7 @@ class F1Binary(GlobalMetric):
|
|
| 1847 |
_metric = None
|
| 1848 |
metric = "f1"
|
| 1849 |
single_reference_per_prediction = True
|
|
|
|
| 1850 |
_requirements_list: List[str] = ["sklearn"]
|
| 1851 |
|
| 1852 |
def prepare(self):
|
|
@@ -2064,6 +2177,8 @@ class F1MultiLabel(GlobalMetric):
|
|
| 2064 |
|
| 2065 |
def prepare(self):
|
| 2066 |
super().prepare()
|
|
|
|
|
|
|
| 2067 |
self._metric = evaluate.load(
|
| 2068 |
self.metric, "multilabel", experiment_id=str(uuid.uuid4())
|
| 2069 |
)
|
|
@@ -3033,7 +3148,7 @@ class SafetyMetric(GlobalMetric):
|
|
| 3033 |
class LlamaIndexLLMMetric(InstanceMetric):
|
| 3034 |
model_name: str = ""
|
| 3035 |
main_score: str = ""
|
| 3036 |
-
prediction_type
|
| 3037 |
reduction_map: Dict[str, List[str]] = None
|
| 3038 |
openai_models: List[str] = ["gpt-3.5-turbo"]
|
| 3039 |
anthropic_models: List[
|
|
@@ -3679,6 +3794,7 @@ class RetrievalAtK(RetrievalMetric):
|
|
| 3679 |
(recall_at_k, "recall"),
|
| 3680 |
(match_at_k, "match"),
|
| 3681 |
]:
|
|
|
|
| 3682 |
max_k = max(measure_array.keys())
|
| 3683 |
for k in self.k_list:
|
| 3684 |
result[self.score_name(measure_name, k)] = measure_array[min(k, max_k)]
|
|
@@ -3725,7 +3841,7 @@ class RemoteMetric(StreamOperator, Metric):
|
|
| 3725 |
remotely (pre and post processing steps in the MetricPipeline will be computed locally).
|
| 3726 |
"""
|
| 3727 |
local_inner_metric = metric_pipeline.metric
|
| 3728 |
-
metric_pipeline =
|
| 3729 |
metric_pipeline
|
| 3730 |
) # To avoid unintentional changes to the catalog contents
|
| 3731 |
metric_pipeline.metric = RemoteMetric(
|
|
@@ -4376,6 +4492,7 @@ class BinaryMaxF1(F1Binary):
|
|
| 4376 |
main_score = "max_f1_binary"
|
| 4377 |
single_reference_per_prediction = True
|
| 4378 |
average = None
|
|
|
|
| 4379 |
|
| 4380 |
def compute(
|
| 4381 |
self,
|
|
@@ -4799,17 +4916,22 @@ class F1Strings(InstanceMetric):
|
|
| 4799 |
"spacy": "Please pip install spacy",
|
| 4800 |
}
|
| 4801 |
|
| 4802 |
-
def
|
| 4803 |
-
super().prepare()
|
| 4804 |
import spacy
|
| 4805 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4806 |
try:
|
| 4807 |
-
self.
|
| 4808 |
except OSError:
|
| 4809 |
from spacy.cli import download
|
| 4810 |
|
| 4811 |
download("en_core_web_sm")
|
| 4812 |
-
self.
|
| 4813 |
|
| 4814 |
def compute(
|
| 4815 |
self,
|
|
@@ -4955,3 +5077,20 @@ class RandomForestMetricsEnsemble(MetricsEnsemble):
|
|
| 4955 |
)
|
| 4956 |
score = ensemble_model.predict([prediction_lst])
|
| 4957 |
return score.tolist()[0]
|
|
|
|
|
|
|
|
| 8 |
from abc import ABC, abstractmethod
|
| 9 |
from collections import Counter, defaultdict
|
| 10 |
from dataclasses import field
|
| 11 |
+
from functools import lru_cache
|
| 12 |
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
|
| 13 |
|
|
|
|
| 14 |
import numpy
|
| 15 |
import numpy as np
|
| 16 |
import pandas as pd
|
|
|
|
| 36 |
StreamingOperator,
|
| 37 |
StreamOperator,
|
| 38 |
)
|
| 39 |
+
from .operators import Copy, Set
|
| 40 |
from .random_utils import get_seed
|
| 41 |
from .settings_utils import get_settings
|
| 42 |
from .stream import MultiStream, Stream
|
| 43 |
from .type_utils import Type, isoftype, parse_type_string, to_type_string
|
| 44 |
+
from .utils import deep_copy
|
| 45 |
|
| 46 |
logger = get_logger()
|
| 47 |
settings = get_settings()
|
| 48 |
|
| 49 |
warnings.filterwarnings("ignore", category=DegenerateDataWarning)
|
| 50 |
|
|
|
|
|
|
|
| 51 |
|
| 52 |
def abstract_factory():
|
| 53 |
return {}
|
|
|
|
| 136 |
return (
|
| 137 |
self.score_prefix + score_name
|
| 138 |
if score_name not in ["score", "score_name"]
|
| 139 |
+
and not score_name.startswith("num_of_instances")
|
| 140 |
else score_name
|
| 141 |
)
|
| 142 |
|
|
|
|
| 145 |
) -> Dict[str, Any]:
|
| 146 |
new_scores = {}
|
| 147 |
for score_name, score in scores.items():
|
| 148 |
+
if isinstance(score, dict):
|
| 149 |
+
new_scores[score_name] = score
|
| 150 |
+
continue # do not prefix group names
|
| 151 |
score_with_prefix = self._add_score_prefix(score_name)
|
| 152 |
new_scores[score_with_prefix] = (
|
| 153 |
score if score_name not in ["score_name"] else self.score_prefix + score
|
| 154 |
)
|
| 155 |
for new_score_name in new_scores:
|
| 156 |
+
if new_score_name in ["score", "score_name"] or new_score_name.startswith(
|
| 157 |
+
"num_of_instances"
|
| 158 |
+
):
|
| 159 |
continue
|
| 160 |
if new_score_name in existing_scores:
|
| 161 |
UnitxtWarning(
|
| 162 |
message=f"Metric '{new_score_name}' that has just been evaluated to {new_scores[new_score_name]}, is already recorded "
|
| 163 |
f"to have value {existing_scores[new_score_name]} by a previous metric evaluation on this instance or stream. "
|
| 164 |
+
f"To avoid overwriting the existing value, add a score_prefix to the metric name (e.g. score_prefix='my_second_' , "
|
| 165 |
+
f"which will yield, in this case, a score named: 'my_second_{new_score_name}')",
|
| 166 |
additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
|
| 167 |
)
|
| 168 |
return new_scores
|
|
|
|
| 283 |
self, instance: Dict[str, Any], global_score: dict
|
| 284 |
):
|
| 285 |
for score_name in global_score:
|
| 286 |
+
if score_name in [
|
| 287 |
+
"score",
|
| 288 |
+
"score_name",
|
| 289 |
+
"score_ci_low",
|
| 290 |
+
"score_ci_high",
|
| 291 |
+
] or score_name.startswith("num_of_instances"):
|
| 292 |
continue
|
| 293 |
if score_name in instance["score"]["global"]:
|
| 294 |
UnitxtWarning(
|
|
|
|
| 478 |
# iterate over the rows and compute the metric on each resampling
|
| 479 |
def metric(sample_refs, sample_preds, sample_task_data):
|
| 480 |
try:
|
| 481 |
+
results = self._compute(
|
| 482 |
references=sample_refs,
|
| 483 |
predictions=sample_preds,
|
| 484 |
task_data=sample_task_data,
|
| 485 |
+
)
|
| 486 |
+
results.update(
|
| 487 |
+
self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
|
| 488 |
+
results, {}
|
| 489 |
+
)
|
| 490 |
+
)
|
| 491 |
+
return results[score_name]
|
| 492 |
except Exception as e:
|
| 493 |
# this happens in edge cases, for example, when the sampling creates a
|
| 494 |
# sample where all strings are empty and this fails bleu.
|
|
|
|
| 553 |
references = []
|
| 554 |
predictions = []
|
| 555 |
task_data = []
|
|
|
|
| 556 |
|
| 557 |
instances = []
|
| 558 |
|
|
|
|
| 603 |
)
|
| 604 |
)
|
| 605 |
self._validate_references_and_prediction(references, predictions)
|
| 606 |
+
global_score = {"num_of_instances": len(instances)}
|
| 607 |
|
| 608 |
result = self._compute(references, predictions, task_data)
|
| 609 |
global_score.update(
|
|
|
|
| 611 |
result, global_score
|
| 612 |
)
|
| 613 |
)
|
| 614 |
+
if self.ci_scores:
|
| 615 |
+
score_names = [
|
| 616 |
+
self._add_score_prefix(score_name) for score_name in self.ci_scores
|
| 617 |
+
]
|
| 618 |
+
else:
|
| 619 |
+
score_names = [global_score["score_name"]]
|
| 620 |
+
|
| 621 |
+
for score_name in score_names:
|
| 622 |
+
confidence_interval = self.compute_global_confidence_intervals(
|
| 623 |
+
references, predictions, task_data, score_name
|
| 624 |
+
)
|
| 625 |
+
global_score.update(confidence_interval)
|
| 626 |
|
| 627 |
for instance in instances:
|
| 628 |
self.update_and_adjust_global_score(instance, global_score)
|
|
|
|
| 671 |
default_factory=lambda: ["mean", "weighted_win_rate"]
|
| 672 |
)
|
| 673 |
|
| 674 |
+
def preprocess_instance(self, instance):
|
| 675 |
+
return instance
|
| 676 |
+
|
| 677 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
|
|
|
| 678 |
instances = []
|
| 679 |
+
for instance in stream:
|
| 680 |
+
self.verify_instance(instance)
|
| 681 |
+
instance = self.preprocess_instance(instance)
|
| 682 |
+
instances.append(instance)
|
| 683 |
|
| 684 |
+
predictions = [instance["prediction"] for instance in instances]
|
| 685 |
+
references = [instance["references"] for instance in instances]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
task_data = [
|
| 687 |
instance["task_data"] if "task_data" in instance else {}
|
| 688 |
+
for instance in instances
|
| 689 |
]
|
| 690 |
self._validate_references_and_prediction(references, predictions)
|
| 691 |
+
global_score = {"num_of_instances": len(instances)}
|
| 692 |
# compute the metric over all refs and preds
|
| 693 |
instance_scores = self.compute(
|
| 694 |
references=references,
|
|
|
|
| 701 |
instance_score["score"] = instance_score[self.main_score]
|
| 702 |
instance_score["score_name"] = self.main_score
|
| 703 |
|
| 704 |
+
for instance, score in zip(instances, instance_scores):
|
| 705 |
if "score" not in instance:
|
| 706 |
instance["score"] = {"global": {}, "instance": {}}
|
| 707 |
|
|
|
|
| 710 |
score, instance["score"]["instance"]
|
| 711 |
)
|
| 712 |
)
|
|
|
|
| 713 |
|
| 714 |
for reduction, fields in self.reduction_map.items():
|
| 715 |
assert (
|
|
|
|
| 1076 |
|
| 1077 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 1078 |
instances = self.compute_instance_scores(stream)
|
| 1079 |
+
global_score = {"num_of_instances": len(instances)}
|
| 1080 |
for reduction_type, reduction_params in self.reduction_map.items():
|
| 1081 |
assert (
|
| 1082 |
reduction_type in self.implemented_reductions
|
|
|
|
| 1113 |
scores_to_resample,
|
| 1114 |
aggregation_function,
|
| 1115 |
) = self._set_up_group_mean_aggregation(
|
| 1116 |
+
instances,
|
| 1117 |
+
reduction_params,
|
| 1118 |
+
reduction_fields,
|
| 1119 |
+
global_score,
|
| 1120 |
)
|
| 1121 |
else:
|
| 1122 |
raise ValueError(
|
|
|
|
| 1191 |
instance_score["score_name"] = self.main_score
|
| 1192 |
if "score" not in instance:
|
| 1193 |
instance["score"] = {"global": {}, "instance": {}}
|
| 1194 |
+
if "global" not in instance["score"]:
|
| 1195 |
+
instance["score"]["global"] = {}
|
| 1196 |
+
if "instance" not in instance["score"]:
|
| 1197 |
+
instance["score"]["instance"] = {}
|
| 1198 |
|
| 1199 |
instance["score"]["instance"].update(
|
| 1200 |
self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
|
| 1201 |
instance_score, instance["score"]["instance"]
|
| 1202 |
)
|
| 1203 |
)
|
|
|
|
| 1204 |
instances.append(instance)
|
| 1205 |
|
| 1206 |
return instances
|
|
|
|
| 1210 |
instances: List[dict],
|
| 1211 |
score_names: List[str],
|
| 1212 |
group_aggregation_func,
|
| 1213 |
+
prepend_score_prefix: bool,
|
| 1214 |
+
global_score: dict,
|
| 1215 |
+
aggregation_function_name: str,
|
| 1216 |
):
|
| 1217 |
"""Group scores by the group_id and subgroup_type fields of each instance, and compute group_aggregation_func by group.
|
| 1218 |
|
|
|
|
| 1224 |
callable function returns a single score for the group
|
| 1225 |
prepend_score_prefix: if True - prepend the score_prefix to the score names in the returned dicts. Set to False
|
| 1226 |
if down the stream such a prepending is expected.
|
| 1227 |
+
global_score: the being built up global score. It will be filled here with number of instances per each group, and group scores.
|
| 1228 |
+
aggregation_function_name: used to annotate the groups' global scores.
|
| 1229 |
|
| 1230 |
Returns:
|
| 1231 |
List of dicts, each corresponding to a group of instances (defined by 'group_id'),
|
|
|
|
| 1260 |
]
|
| 1261 |
)
|
| 1262 |
|
| 1263 |
+
# count the instances in each group and subgroup.
|
| 1264 |
+
# Each instance goes into group_to_instances per each score_name.
|
| 1265 |
+
# So we count over the first score_name only
|
| 1266 |
+
for group_key in group_to_instance_scores:
|
| 1267 |
+
if group_key not in global_score:
|
| 1268 |
+
global_score[group_key] = {}
|
| 1269 |
+
global_score[group_key]["num_of_instances"] = sum(
|
| 1270 |
+
[
|
| 1271 |
+
len(
|
| 1272 |
+
group_to_instance_scores[group_key][score_names[0]][
|
| 1273 |
+
subgroup_type
|
| 1274 |
+
]
|
| 1275 |
+
)
|
| 1276 |
+
for subgroup_type in group_to_instance_scores[group_key][
|
| 1277 |
+
score_names[0]
|
| 1278 |
+
]
|
| 1279 |
+
]
|
| 1280 |
+
)
|
| 1281 |
+
|
| 1282 |
# if group_aggregation_func expects a subgroup-types score dict, pass it; otherwise pass the default type list of scores
|
| 1283 |
+
to_return = [
|
| 1284 |
{
|
| 1285 |
"score": {
|
| 1286 |
"instance": {
|
|
|
|
| 1301 |
) # sorted for consistency
|
| 1302 |
]
|
| 1303 |
|
| 1304 |
+
# update each group section in global_score
|
| 1305 |
+
for i, group_name in enumerate(sorted(group_to_instance_scores.keys())):
|
| 1306 |
+
global_score[group_name].update(
|
| 1307 |
+
{
|
| 1308 |
+
aggregation_function_name + "_" + k: v
|
| 1309 |
+
for k, v in to_return[i]["score"]["instance"].items()
|
| 1310 |
+
}
|
| 1311 |
+
)
|
| 1312 |
+
|
| 1313 |
+
return to_return
|
| 1314 |
+
|
| 1315 |
def _set_up_group_mean_aggregation(
|
| 1316 |
+
self,
|
| 1317 |
+
instances,
|
| 1318 |
+
reduction_params,
|
| 1319 |
+
reduction_fields,
|
| 1320 |
+
global_score,
|
| 1321 |
):
|
| 1322 |
+
aggregation_function_name = str(reduction_params["agg_func"][0])
|
| 1323 |
group_aggregation_func = reduction_params["agg_func"][1]
|
| 1324 |
# if treat groups as units
|
| 1325 |
do_resample_as_group = reduction_params["agg_func"][2]
|
|
|
|
| 1327 |
# pass the group aggregate---not instance---scores to resample as usual
|
| 1328 |
aggregation_function = self.average_item_scores
|
| 1329 |
scores_to_resample = self.get_group_scores(
|
| 1330 |
+
instances=instances,
|
| 1331 |
+
score_names=reduction_fields,
|
| 1332 |
+
group_aggregation_func=group_aggregation_func,
|
| 1333 |
+
prepend_score_prefix=True,
|
| 1334 |
+
global_score=global_score,
|
| 1335 |
+
aggregation_function_name=aggregation_function_name,
|
| 1336 |
)
|
| 1337 |
else:
|
| 1338 |
# pass the instance scores to resample, and calculate the group aggregation on the resamplings
|
|
|
|
| 1344 |
group_aggregation_func=group_aggregation_func,
|
| 1345 |
):
|
| 1346 |
group_scores = self.get_group_scores(
|
| 1347 |
+
instances=instances,
|
| 1348 |
+
score_names=[field_name],
|
| 1349 |
+
group_aggregation_func=group_aggregation_func,
|
| 1350 |
+
prepend_score_prefix=False,
|
| 1351 |
+
global_score=global_score,
|
| 1352 |
+
aggregation_function_name=aggregation_function_name,
|
| 1353 |
)
|
| 1354 |
return nan_mean(
|
| 1355 |
[group["score"]["instance"][field_name] for group in group_scores]
|
|
|
|
| 1387 |
reduction_map = {"mean": ["anls"]}
|
| 1388 |
prediction_type = Any # string representation is compared
|
| 1389 |
|
| 1390 |
+
@staticmethod
|
| 1391 |
+
@lru_cache(maxsize=10000)
|
| 1392 |
+
def preprocess_text(text):
|
| 1393 |
+
return " ".join(text.strip().lower().split()), len(text.upper())
|
| 1394 |
+
|
| 1395 |
+
def distance(self, prediction, reference):
|
| 1396 |
+
processed_reference, len_reference = self.preprocess_text(reference)
|
| 1397 |
+
processed_prediction, len_prediction = self.preprocess_text(prediction)
|
| 1398 |
+
|
| 1399 |
+
dist = self.levenshtein_distance(processed_reference, processed_prediction)
|
| 1400 |
+
length = max(len_reference, len_prediction)
|
| 1401 |
+
return 0.0 if length == 0 else float(dist) / float(length)
|
| 1402 |
+
|
| 1403 |
def compute(
|
| 1404 |
self,
|
| 1405 |
references: List[Any],
|
|
|
|
| 1409 |
) -> dict:
|
| 1410 |
"""ANLS image-text accuracy metric."""
|
| 1411 |
values = []
|
| 1412 |
+
for reference in references:
|
| 1413 |
+
values.append(self.distance(prediction, reference))
|
|
| 1414 |
|
| 1415 |
question_result = 1.0 - min(values)
|
| 1416 |
|
| 1417 |
if question_result < threshold:
|
| 1418 |
question_result = 0.0
|
| 1419 |
+
|
| 1420 |
result = {}
|
| 1421 |
result["score"] = question_result
|
| 1422 |
result[self.main_score] = question_result
|
|
|
|
| 1424 |
return result
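A standalone sketch of the ANLS computation above. The 0.5 threshold is an assumption (in the metric it comes from the configured threshold), and for simplicity the lengths here are taken from the normalized strings, whereas the code above uses the raw text lengths.

# Minimal ANLS illustration: normalized edit distance, best reference wins,
# scores below the threshold are zeroed.
def levenshtein(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    previous = list(range(len(s1) + 1))
    for j, c2 in enumerate(s2):
        current = [j + 1]
        for i, c1 in enumerate(s1):
            current.append(min(previous[i + 1] + 1, current[i] + 1, previous[i] + (c1 != c2)))
        previous = current
    return previous[-1]

def anls(prediction, references, threshold=0.5):
    def norm(text):
        return " ".join(text.strip().lower().split())
    distances = []
    for ref in references:
        p, r = norm(prediction), norm(ref)
        length = max(len(p), len(r))
        distances.append(0.0 if length == 0 else levenshtein(p, r) / length)
    score = 1.0 - min(distances)
    return score if score >= threshold else 0.0

print(anls("San  Francisco", ["san francisco", "SF"]))  # 1.0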
|
| 1425 |
|
| 1426 |
@staticmethod
|
| 1427 |
+
@lru_cache(maxsize=10000)
|
| 1428 |
def levenshtein_distance(s1, s2):
|
| 1429 |
if len(s1) > len(s2):
|
| 1430 |
s1, s2 = s2, s1
|
|
|
|
| 1606 |
), "Must define at most one of postpreprocess_steps (which is deprecated) and postprocess_steps (to be used from now on)"
|
| 1607 |
if has_postpreprocess:
|
| 1608 |
self.postprocess_steps = self.postpreprocess_steps
|
| 1609 |
+
self.prepare_score = SequentialOperator(
|
| 1610 |
+
steps=[
|
| 1611 |
+
Copy(
|
| 1612 |
+
field=f"score/instance/{self.metric._add_score_prefix(self.main_score)}",
|
| 1613 |
+
to_field="score/instance/score",
|
| 1614 |
+
),
|
| 1615 |
+
Copy(
|
| 1616 |
+
field=f"score/global/{self.metric._add_score_prefix(self.main_score)}",
|
| 1617 |
+
to_field="score/global/score",
|
| 1618 |
+
),
|
| 1619 |
+
Copy(
|
| 1620 |
+
field=f"score/global/{self.metric._add_score_prefix(self.main_score)}_ci_low",
|
| 1621 |
+
to_field="score/global/score_ci_low",
|
| 1622 |
+
not_exist_do_nothing=True,
|
| 1623 |
+
),
|
| 1624 |
+
Copy(
|
| 1625 |
+
field=f"score/global/{self.metric._add_score_prefix(self.main_score)}_ci_high",
|
| 1626 |
+
to_field="score/global/score_ci_high",
|
| 1627 |
+
not_exist_do_nothing=True,
|
| 1628 |
+
),
|
| 1629 |
+
Set(
|
| 1630 |
+
fields={
|
| 1631 |
+
"score/instance/score_name": self.metric._add_score_prefix(
|
| 1632 |
+
self.main_score
|
| 1633 |
+
)
|
| 1634 |
+
}
|
| 1635 |
+
),
|
| 1636 |
+
Set(
|
| 1637 |
+
fields={
|
| 1638 |
+
"score/global/score_name": self.metric._add_score_prefix(
|
| 1639 |
+
self.main_score
|
| 1640 |
+
)
|
| 1641 |
+
}
|
| 1642 |
+
),
|
| 1643 |
],
|
| 1644 |
)
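Illustration only (the score name "my_prefix_f1" is an assumption): the Copy/Set steps above expose the metric's prefixed main score under generic keys, so downstream consumers can always read score/instance/score regardless of the metric's own score name.

# Before prepare_score: the metric wrote only its own (possibly prefixed) keys.
instance = {"score": {"instance": {"my_prefix_f1": 0.82}, "global": {"my_prefix_f1": 0.79}}}

main = "my_prefix_f1"
instance["score"]["instance"]["score"] = instance["score"]["instance"][main]
instance["score"]["global"]["score"] = instance["score"]["global"][main]
instance["score"]["instance"]["score_name"] = main
instance["score"]["global"]["score_name"] = main
# *_ci_low / *_ci_high are copied the same way, but only if they exist
# (that is what not_exist_do_nothing=True is for).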
|
| 1645 |
|
|
|
|
| 1693 |
|
| 1694 |
def prepare(self):
|
| 1695 |
super().prepare()
|
| 1696 |
+
import evaluate
|
| 1697 |
+
|
| 1698 |
self.metric = evaluate.load(
|
| 1699 |
self.hf_metric_name, experiment_id=self.experiment_id
|
| 1700 |
)
|
|
|
|
| 1769 |
|
| 1770 |
def prepare(self):
|
| 1771 |
super().prepare()
|
| 1772 |
+
import evaluate
|
| 1773 |
+
|
| 1774 |
self.metric = evaluate.load(
|
| 1775 |
self.hf_metric_name, experiment_id=str(uuid.uuid4())
|
| 1776 |
)
|
|
|
|
| 1817 |
|
| 1818 |
def prepare(self):
|
| 1819 |
super().prepare()
|
| 1820 |
+
import evaluate
|
| 1821 |
+
|
| 1822 |
self.metric = evaluate.load(
|
| 1823 |
self.hf_metric_name, experiment_id=str(uuid.uuid4())
|
| 1824 |
)
|
|
|
|
| 1898 |
|
| 1899 |
def prepare(self):
|
| 1900 |
super().prepare()
|
| 1901 |
+
import evaluate
|
| 1902 |
+
|
| 1903 |
self._metric = evaluate.load(self.metric, experiment_id=str(uuid.uuid4()))
|
| 1904 |
|
| 1905 |
def get_str_id(self, str):
|
|
|
|
| 1959 |
_metric = None
|
| 1960 |
metric = "f1"
|
| 1961 |
single_reference_per_prediction = True
|
| 1962 |
+
ci_scores = [main_score, "f1_binary_neg"]
|
| 1963 |
_requirements_list: List[str] = ["sklearn"]
|
| 1964 |
|
| 1965 |
def prepare(self):
|
|
|
|
| 2177 |
|
| 2178 |
def prepare(self):
|
| 2179 |
super().prepare()
|
| 2180 |
+
import evaluate
|
| 2181 |
+
|
| 2182 |
self._metric = evaluate.load(
|
| 2183 |
self.metric, "multilabel", experiment_id=str(uuid.uuid4())
|
| 2184 |
)
|
|
|
|
| 3148 |
class LlamaIndexLLMMetric(InstanceMetric):
|
| 3149 |
model_name: str = ""
|
| 3150 |
main_score: str = ""
|
| 3151 |
+
prediction_type = str
|
| 3152 |
reduction_map: Dict[str, List[str]] = None
|
| 3153 |
openai_models: List[str] = ["gpt-3.5-turbo"]
|
| 3154 |
anthropic_models: List[
|
|
|
|
| 3794 |
(recall_at_k, "recall"),
|
| 3795 |
(match_at_k, "match"),
|
| 3796 |
]:
|
| 3797 |
+
measure_array[0] = 0.0 # to support cases where the prediction is empty.
|
| 3798 |
max_k = max(measure_array.keys())
|
| 3799 |
for k in self.k_list:
|
| 3800 |
result[self.score_name(measure_name, k)] = measure_array[min(k, max_k)]
|
|
|
|
| 3841 |
remotely (pre and post processing steps in the MetricPipeline will be computed locally).
|
| 3842 |
"""
|
| 3843 |
local_inner_metric = metric_pipeline.metric
|
| 3844 |
+
metric_pipeline = deep_copy(
|
| 3845 |
metric_pipeline
|
| 3846 |
) # To avoid unintentional changes to the catalog contents
|
| 3847 |
metric_pipeline.metric = RemoteMetric(
|
|
|
|
| 4492 |
main_score = "max_f1_binary"
|
| 4493 |
single_reference_per_prediction = True
|
| 4494 |
average = None
|
| 4495 |
+
ci_scores = [main_score, "max_f1_binary_neg"]
|
| 4496 |
|
| 4497 |
def compute(
|
| 4498 |
self,
|
|
|
|
| 4916 |
"spacy": "Please pip install spacy",
|
| 4917 |
}
|
| 4918 |
|
| 4919 |
+
def load_spacy(self):
|
|
|
|
| 4920 |
import spacy
|
| 4921 |
|
| 4922 |
+
self.nlp = spacy.load(
|
| 4923 |
+
"en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer"]
|
| 4924 |
+
)
|
| 4925 |
+
|
| 4926 |
+
def prepare(self):
|
| 4927 |
+
super().prepare()
|
| 4928 |
try:
|
| 4929 |
+
self.load_spacy()
|
| 4930 |
except OSError:
|
| 4931 |
from spacy.cli import download
|
| 4932 |
|
| 4933 |
download("en_core_web_sm")
|
| 4934 |
+
self.load_spacy()
|
| 4935 |
|
| 4936 |
def compute(
|
| 4937 |
self,
|
|
|
|
| 5077 |
)
|
| 5078 |
score = ensemble_model.predict([prediction_lst])
|
| 5079 |
return score.tolist()[0]
|
| 5080 |
+
|
| 5081 |
+
|
| 5082 |
+
class PredictionLength(InstanceMetric):
|
| 5083 |
+
"""Returns the length of the prediction."""
|
| 5084 |
+
|
| 5085 |
+
main_score = "prediction_length"
|
| 5086 |
+
reduction_map = {"mean": ["prediction_length"]}
|
| 5087 |
+
prediction_type = str
|
| 5088 |
+
single_reference_per_prediction = True
|
| 5089 |
+
|
| 5090 |
+
def compute(
|
| 5091 |
+
self,
|
| 5092 |
+
references: List[str],
|
| 5093 |
+
prediction: str,
|
| 5094 |
+
task_data: List[Dict],
|
| 5095 |
+
) -> dict:
|
| 5096 |
+
return {self.main_score: [len(prediction)], "score_name": self.main_score}
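So a prediction of "hello world", for example, is scored as its character length, and the mean reduction then averages these lengths over the dataset:

prediction = "hello world"
print({"prediction_length": [len(prediction)], "score_name": "prediction_length"})
# {'prediction_length': [11], 'score_name': 'prediction_length'}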
|
operators.py
CHANGED
|
@@ -39,7 +39,6 @@ General Operators List:
|
|
| 39 |
------------------------
|
| 40 |
"""
|
| 41 |
|
| 42 |
-
import copy
|
| 43 |
import operator
|
| 44 |
import uuid
|
| 45 |
import warnings
|
|
@@ -82,14 +81,19 @@ from .operator import (
|
|
| 82 |
StreamOperator,
|
| 83 |
)
|
| 84 |
from .random_utils import new_random_generator
|
| 85 |
-
from .settings_utils import
|
| 86 |
-
from .stream import DynamicStream, Stream
|
| 87 |
from .text_utils import nested_tuple_to_string
|
| 88 |
from .type_utils import isoftype
|
| 89 |
-
from .utils import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
settings = get_settings()
|
| 92 |
-
constants = get_constants()
|
| 93 |
|
| 94 |
|
| 95 |
class FromIterables(StreamInitializerOperator):
|
|
@@ -132,8 +136,8 @@ class MapInstanceValues(InstanceOperator):
|
|
| 132 |
it maps values of instances in a stream using predefined mappers.
|
| 133 |
|
| 134 |
Attributes:
|
| 135 |
-
mappers (Dict[str, Dict[str,
|
| 136 |
-
Keys are the names of the fields to
|
| 137 |
that define the mapping from old values to new values.
|
| 138 |
strict (bool): If True, the mapping is applied strictly. That means if a value
|
| 139 |
does not exist in the mapper, it will raise a KeyError. If False, values
|
|
@@ -203,13 +207,12 @@ class MapInstanceValues(InstanceOperator):
|
|
| 203 |
|
| 204 |
def get_mapped_value(self, instance, key, mapper, val):
|
| 205 |
val_as_str = str(val) # make sure the value is a string
|
| 206 |
-
if
|
|
|
|
|
|
|
| 207 |
raise KeyError(
|
| 208 |
f"value '{val}' in instance '{instance}' is not found in mapper '{mapper}', associated with field '{key}'."
|
| 209 |
)
|
| 210 |
-
# By default deep copy the value in mapper to avoid shared modifications
|
| 211 |
-
if val_as_str in mapper:
|
| 212 |
-
return deepcopy(mapper[val_as_str])
|
| 213 |
return val
|
| 214 |
|
| 215 |
|
|
@@ -269,7 +272,7 @@ class Set(InstanceOperator):
|
|
| 269 |
) -> Dict[str, Any]:
|
| 270 |
for key, value in self.fields.items():
|
| 271 |
if self.use_deepcopy:
|
| 272 |
-
value =
|
| 273 |
dict_set(instance, key, value)
|
| 274 |
return instance
|
| 275 |
|
|
@@ -318,6 +321,13 @@ class SelectFields(InstanceOperator):
|
|
| 318 |
return new_instance
|
| 319 |
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
class InstanceFieldOperator(InstanceOperator):
|
| 322 |
"""A general stream instance operator that processes the values of a field (or multiple ones).
|
| 323 |
|
|
@@ -348,6 +358,7 @@ class InstanceFieldOperator(InstanceOperator):
|
|
| 348 |
process_every_value: bool = False
|
| 349 |
get_default: Any = None
|
| 350 |
not_exist_ok: bool = False
|
|
|
|
| 351 |
|
| 352 |
def verify(self):
|
| 353 |
super().verify()
|
|
@@ -429,19 +440,18 @@ class InstanceFieldOperator(InstanceOperator):
|
|
| 429 |
self, instance: Dict[str, Any], stream_name: Optional[str] = None
|
| 430 |
) -> Dict[str, Any]:
|
| 431 |
self.verify_field_definition()
|
| 432 |
-
# Need to deep copy instance, because when assigning two dictionary fields,
|
| 433 |
-
# dict_set() the target field dictionary fields.
|
| 434 |
-
# This means that if this target field was assigned to another field before,
|
| 435 |
-
# the field is updated as well.
|
| 436 |
-
instance = deepcopy(instance)
|
| 437 |
for from_field, to_field in self._field_to_field:
|
| 438 |
try:
|
| 439 |
old_value = dict_get(
|
| 440 |
instance,
|
| 441 |
from_field,
|
| 442 |
-
default=
|
| 443 |
-
not_exist_ok=self.not_exist_ok,
|
| 444 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
except Exception as e:
|
| 446 |
raise ValueError(
|
| 447 |
f"Failed to get '{from_field}' from {instance} due to : {e}"
|
|
@@ -476,6 +486,13 @@ class FieldOperator(InstanceFieldOperator):
|
|
| 476 |
pass
|
| 477 |
|
| 478 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
class Rename(FieldOperator):
|
| 480 |
"""Renames fields.
|
| 481 |
|
|
@@ -643,7 +660,9 @@ class ListFieldValues(InstanceOperator):
|
|
| 643 |
values = []
|
| 644 |
for field_name in self.fields:
|
| 645 |
values.append(dict_get(instance, field_name))
|
| 646 |
-
|
|
|
|
|
|
|
| 647 |
return instance
|
| 648 |
|
| 649 |
|
|
@@ -680,7 +699,7 @@ class ZipFieldValues(InstanceOperator):
|
|
| 680 |
zipped = zip_longest(*values)
|
| 681 |
else:
|
| 682 |
zipped = zip(*values)
|
| 683 |
-
instance
|
| 684 |
return instance
|
| 685 |
|
| 686 |
|
|
@@ -847,14 +866,15 @@ class Copy(FieldOperator):
|
|
| 847 |
|
| 848 |
"""
|
| 849 |
|
| 850 |
-
use_deep_copy: bool = True
|
| 851 |
-
|
| 852 |
def process_value(self, value: Any) -> Any:
|
| 853 |
-
if self.use_deep_copy:
|
| 854 |
-
return copy.deepcopy(value)
|
| 855 |
return value
|
| 856 |
|
| 857 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 858 |
@deprecation(version="2.0.0", alternative=Copy)
|
| 859 |
class CopyFields(Copy):
|
| 860 |
pass
|
|
@@ -1022,7 +1042,7 @@ class ArtifactFetcherMixin:
|
|
| 1022 |
if artifact_identifier not in cls.cache:
|
| 1023 |
artifact, artifactory = fetch_artifact(artifact_identifier)
|
| 1024 |
cls.cache[artifact_identifier] = artifact
|
| 1025 |
-
return
|
| 1026 |
|
| 1027 |
|
| 1028 |
class ApplyOperatorsField(InstanceOperator):
|
|
@@ -1602,7 +1622,23 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
|
|
| 1602 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 1603 |
from .metrics import Metric
|
| 1604 |
|
| 1605 |
-
|
|
|
|
| 1606 |
|
| 1607 |
metric_names = first_instance.get(self.metric_field, [])
|
| 1608 |
if not metric_names:
|
|
@@ -1619,16 +1655,6 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
|
|
| 1619 |
# by the first listed metric (as desired).
|
| 1620 |
metric_names = list(reversed(metric_names))
|
| 1621 |
|
| 1622 |
-
# Workaround: The metric/MetricPipeline modifies the stream itself, sometimes making it incompatible
|
| 1623 |
-
# for further metrics' processing, instead of just modifying the score field.
|
| 1624 |
-
# Here we keep all the fields besides the score, and restore them after the metric finishes.
|
| 1625 |
-
first_instance = stream.peek()
|
| 1626 |
-
keys_to_restore = set(first_instance.keys()).difference({"score"})
|
| 1627 |
-
multi_stream = MultiStream({stream_name: stream})
|
| 1628 |
-
multi_stream = CopyFields(
|
| 1629 |
-
field_to_field={k: f"{k}_orig" for k in keys_to_restore}
|
| 1630 |
-
)(multi_stream)
|
| 1631 |
-
|
| 1632 |
for metric_name in metric_names:
|
| 1633 |
metric = self.get_artifact(metric_name)
|
| 1634 |
assert isinstance(
|
|
@@ -1637,17 +1663,23 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
|
|
| 1637 |
|
| 1638 |
if not self.calc_confidence_intervals:
|
| 1639 |
metric.disable_confidence_interval_calculation()
|
| 1640 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1641 |
multi_stream = metric(multi_stream)
|
| 1642 |
-
|
| 1643 |
-
|
| 1644 |
-
)
|
|
|
|
|
|
|
|
|
|
| 1645 |
|
| 1646 |
-
|
| 1647 |
-
multi_stream
|
| 1648 |
-
)
|
| 1649 |
-
stream = multi_stream[stream_name]
|
| 1650 |
-
yield from stream
|
| 1651 |
|
| 1652 |
|
| 1653 |
class MergeStreams(MultiStreamOperator):
|
|
@@ -2066,7 +2098,7 @@ class DuplicateInstances(StreamOperator):
|
|
| 2066 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 2067 |
for instance in stream:
|
| 2068 |
for idx in range(self.num_duplications):
|
| 2069 |
-
duplicate =
|
| 2070 |
if self.duplication_index_field:
|
| 2071 |
duplicate.update({self.duplication_index_field: idx})
|
| 2072 |
yield duplicate
|
|
|
|
| 39 |
------------------------
|
| 40 |
"""
|
| 41 |
|
|
|
|
| 42 |
import operator
|
| 43 |
import uuid
|
| 44 |
import warnings
|
|
|
|
| 81 |
StreamOperator,
|
| 82 |
)
|
| 83 |
from .random_utils import new_random_generator
|
| 84 |
+
from .settings_utils import get_settings
|
| 85 |
+
from .stream import DynamicStream, ListStream, Stream
|
| 86 |
from .text_utils import nested_tuple_to_string
|
| 87 |
from .type_utils import isoftype
|
| 88 |
+
from .utils import (
|
| 89 |
+
deep_copy,
|
| 90 |
+
flatten_dict,
|
| 91 |
+
recursive_copy,
|
| 92 |
+
recursive_shallow_copy,
|
| 93 |
+
shallow_copy,
|
| 94 |
+
)
|
| 95 |
|
| 96 |
settings = get_settings()
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
class FromIterables(StreamInitializerOperator):
|
|
|
|
| 136 |
it maps values of instances in a stream using predefined mappers.
|
| 137 |
|
| 138 |
Attributes:
|
| 139 |
+
mappers (Dict[str, Dict[str, Any]]): The mappers to use for mapping instance values.
|
| 140 |
+
Keys are the names of the fields to undergo mapping, and values are dictionaries
|
| 141 |
that define the mapping from old values to new values.
|
| 142 |
strict (bool): If True, the mapping is applied strictly. That means if a value
|
| 143 |
does not exist in the mapper, it will raise a KeyError. If False, values
|
|
|
|
| 207 |
|
| 208 |
def get_mapped_value(self, instance, key, mapper, val):
|
| 209 |
val_as_str = str(val) # make sure the value is a string
|
| 210 |
+
if val_as_str in mapper:
|
| 211 |
+
return recursive_copy(mapper[val_as_str])
|
| 212 |
+
if self.strict:
|
| 213 |
raise KeyError(
|
| 214 |
f"value '{val}' in instance '{instance}' is not found in mapper '{mapper}', associated with field '{key}'."
|
| 215 |
)
|
|
|
|
|
|
|
|
|
|
| 216 |
return val
|
| 217 |
|
| 218 |
|
|
|
|
| 272 |
) -> Dict[str, Any]:
|
| 273 |
for key, value in self.fields.items():
|
| 274 |
if self.use_deepcopy:
|
| 275 |
+
value = deep_copy(value)
|
| 276 |
dict_set(instance, key, value)
|
| 277 |
return instance
|
| 278 |
|
|
|
|
| 321 |
return new_instance
|
| 322 |
|
| 323 |
|
| 324 |
+
class DefaultPlaceHolder:
|
| 325 |
+
pass
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
default_place_holder = DefaultPlaceHolder()
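The point of this sentinel, as a sketch (not part of the diff): a dedicated object lets dict_get distinguish "field is missing" from "field exists and is None", which a plain None default cannot do.

# Hypothetical stand-in for dict_get, just to show why a unique sentinel is used.
_MISSING = object()

def get_field(instance, key, default=_MISSING):
    return instance.get(key, default)

instance = {"label": None}
assert get_field(instance, "label") is None          # present, value happens to be None
assert get_field(instance, "missing") is _MISSING    # genuinely absent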
|
| 329 |
+
|
| 330 |
+
|
| 331 |
class InstanceFieldOperator(InstanceOperator):
|
| 332 |
"""A general stream instance operator that processes the values of a field (or multiple ones).
|
| 333 |
|
|
|
|
| 358 |
process_every_value: bool = False
|
| 359 |
get_default: Any = None
|
| 360 |
not_exist_ok: bool = False
|
| 361 |
+
not_exist_do_nothing: bool = False
|
| 362 |
|
| 363 |
def verify(self):
|
| 364 |
super().verify()
|
|
|
|
| 440 |
self, instance: Dict[str, Any], stream_name: Optional[str] = None
|
| 441 |
) -> Dict[str, Any]:
|
| 442 |
self.verify_field_definition()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
for from_field, to_field in self._field_to_field:
|
| 444 |
try:
|
| 445 |
old_value = dict_get(
|
| 446 |
instance,
|
| 447 |
from_field,
|
| 448 |
+
default=default_place_holder,
|
| 449 |
+
not_exist_ok=self.not_exist_ok or self.not_exist_do_nothing,
|
| 450 |
)
|
| 451 |
+
if old_value is default_place_holder:
|
| 452 |
+
if self.not_exist_do_nothing:
|
| 453 |
+
return instance
|
| 454 |
+
old_value = self.get_default
|
| 455 |
except Exception as e:
|
| 456 |
raise ValueError(
|
| 457 |
f"Failed to get '{from_field}' from {instance} due to : {e}"
|
|
|
|
| 486 |
pass
|
| 487 |
|
| 488 |
|
| 489 |
+
class MapValues(FieldOperator):
|
| 490 |
+
mapping: Dict[str, str]
|
| 491 |
+
|
| 492 |
+
def process_value(self, value: Any) -> Any:
|
| 493 |
+
return self.mapping[str(value)]
|
| 494 |
+
|
| 495 |
+
|
| 496 |
class Rename(FieldOperator):
|
| 497 |
"""Renames fields.
|
| 498 |
|
|
|
|
| 660 |
values = []
|
| 661 |
for field_name in self.fields:
|
| 662 |
values.append(dict_get(instance, field_name))
|
| 663 |
+
|
| 664 |
+
dict_set(instance, self.to_field, values)
|
| 665 |
+
|
| 666 |
return instance
|
| 667 |
|
| 668 |
|
|
|
|
| 699 |
zipped = zip_longest(*values)
|
| 700 |
else:
|
| 701 |
zipped = zip(*values)
|
| 702 |
+
dict_set(instance, self.to_field, list(zipped))
|
| 703 |
return instance
|
| 704 |
|
| 705 |
|
|
|
|
| 866 |
|
| 867 |
"""
|
| 868 |
|
|
|
|
|
|
|
| 869 |
def process_value(self, value: Any) -> Any:
|
|
|
|
|
|
|
| 870 |
return value
|
| 871 |
|
| 872 |
|
| 873 |
+
class RecursiveCopy(FieldOperator):
|
| 874 |
+
def process_value(self, value: Any) -> Any:
|
| 875 |
+
return recursive_copy(value)
|
| 876 |
+
|
| 877 |
+
|
| 878 |
@deprecation(version="2.0.0", alternative=Copy)
|
| 879 |
class CopyFields(Copy):
|
| 880 |
pass
|
|
|
|
| 1042 |
if artifact_identifier not in cls.cache:
|
| 1043 |
artifact, artifactory = fetch_artifact(artifact_identifier)
|
| 1044 |
cls.cache[artifact_identifier] = artifact
|
| 1045 |
+
return shallow_copy(cls.cache[artifact_identifier])
|
| 1046 |
|
| 1047 |
|
| 1048 |
class ApplyOperatorsField(InstanceOperator):
|
|
|
|
| 1622 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 1623 |
from .metrics import Metric
|
| 1624 |
|
| 1625 |
+
# Number of instances in input stream is assumed to be small. This is why
|
| 1626 |
+
# each metric consumes all of them and lays them in its main memory, and even generates
|
| 1627 |
+
# some 1000 copies thereof for the sake of CI.
|
| 1628 |
+
# So we start with deep copying here, to make a 'frozen' status of the stream, having
|
| 1629 |
+
# passed the preprocess_steps of the task, and inference, and now getting to be evaluated,
|
| 1630 |
+
# a frozen status to be fed into each of the metrics listed in metric_field,
|
| 1631 |
+
# so that the evaluation of one does not affect the evaluation of another
|
| 1632 |
+
# (typically, affecting via change of instance as part of
|
| 1633 |
+
# preprocess_steps of MetricPipeline, as illustrated in docs/adding_metrics/Using Metric Pipelines).
|
| 1634 |
+
|
| 1635 |
+
instances_upon_entrance_to_metrics_evaluations = []
|
| 1636 |
+
for instance in stream:
|
| 1637 |
+
instances_upon_entrance_to_metrics_evaluations.append(
|
| 1638 |
+
recursive_copy(instance)
|
| 1639 |
+
)
|
| 1640 |
+
|
| 1641 |
+
first_instance = instances_upon_entrance_to_metrics_evaluations[0]
|
| 1642 |
|
| 1643 |
metric_names = first_instance.get(self.metric_field, [])
|
| 1644 |
if not metric_names:
|
|
|
|
| 1655 |
# by the first listed metric (as desired).
|
| 1656 |
metric_names = list(reversed(metric_names))
|
| 1657 |
|
|
|
| 1658 |
for metric_name in metric_names:
|
| 1659 |
metric = self.get_artifact(metric_name)
|
| 1660 |
assert isinstance(
|
|
|
|
| 1663 |
|
| 1664 |
if not self.calc_confidence_intervals:
|
| 1665 |
metric.disable_confidence_interval_calculation()
|
| 1666 |
+
multi_stream = MultiStream(
|
| 1667 |
+
{
|
| 1668 |
+
"tmp": ListStream(
|
| 1669 |
+
instances_list=instances_upon_entrance_to_metrics_evaluations,
|
| 1670 |
+
copying=True, # ensures deep copy when iterating over instances
|
| 1671 |
+
)
|
| 1672 |
+
}
|
| 1673 |
+
)
|
| 1674 |
multi_stream = metric(multi_stream)
|
| 1675 |
+
for evaluated_instance, freezed_instance in zip(
|
| 1676 |
+
multi_stream["tmp"], instances_upon_entrance_to_metrics_evaluations
|
| 1677 |
+
):
|
| 1678 |
+
freezed_instance["score"] = recursive_shallow_copy(
|
| 1679 |
+
evaluated_instance["score"]
|
| 1680 |
+
)
|
| 1681 |
|
| 1682 |
+
yield from instances_upon_entrance_to_metrics_evaluations
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1683 |
|
| 1684 |
|
| 1685 |
class MergeStreams(MultiStreamOperator):
|
|
|
|
| 2098 |
def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
|
| 2099 |
for instance in stream:
|
| 2100 |
for idx in range(self.num_duplications):
|
| 2101 |
+
duplicate = recursive_shallow_copy(instance)
|
| 2102 |
if self.duplication_index_field:
|
| 2103 |
duplicate.update({self.duplication_index_field: idx})
|
| 2104 |
yield duplicate
|
processors.py
CHANGED
|
@@ -2,9 +2,12 @@ import ast
|
|
| 2 |
import copy
|
| 3 |
import json
|
| 4 |
import re
|
|
|
|
| 5 |
from difflib import get_close_matches
|
| 6 |
from typing import Any, Dict
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from .deprecation_utils import deprecation
|
| 9 |
from .operator import MultiStreamOperator
|
| 10 |
from .operators import FieldOperator, InstanceFieldOperator
|
|
@@ -20,9 +23,9 @@ class PostProcess(MultiStreamOperator):
|
|
| 20 |
|
| 21 |
def prepare(self):
|
| 22 |
super().prepare()
|
| 23 |
-
self.prediction_operator = copy.
|
| 24 |
self.prediction_operator.field = "prediction"
|
| 25 |
-
self.references_operator = copy.
|
| 26 |
self.references_operator.field = "references"
|
| 27 |
self.references_operator.process_every_value = True
|
| 28 |
self.references_operator.dont_apply_to_streams = [constants.inference_stream]
|
|
@@ -315,3 +318,75 @@ class ExtractArenaHardNumericalJudgment(FieldOperator):
|
|
| 315 |
|
| 316 |
except:
|
| 317 |
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import copy
|
| 3 |
import json
|
| 4 |
import re
|
| 5 |
+
import string
|
| 6 |
from difflib import get_close_matches
|
| 7 |
from typing import Any, Dict
|
| 8 |
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
from .deprecation_utils import deprecation
|
| 12 |
from .operator import MultiStreamOperator
|
| 13 |
from .operators import FieldOperator, InstanceFieldOperator
|
|
|
|
| 23 |
|
| 24 |
def prepare(self):
|
| 25 |
super().prepare()
|
| 26 |
+
self.prediction_operator = copy.copy(self.operator)
|
| 27 |
self.prediction_operator.field = "prediction"
|
| 28 |
+
self.references_operator = copy.copy(self.operator)
|
| 29 |
self.references_operator.field = "references"
|
| 30 |
self.references_operator.process_every_value = True
|
| 31 |
self.references_operator.dont_apply_to_streams = [constants.inference_stream]
|
|
|
|
| 318 |
|
| 319 |
except:
|
| 320 |
return 0
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
class InferDictsToBinaryLogprobs(FieldOperator):
|
| 324 |
+
neg_class_name: str
|
| 325 |
+
pos_class_name: str
|
| 326 |
+
|
| 327 |
+
take_logprobs_from_end: bool = False
|
| 328 |
+
num_logprobs_to_take: int = 3
|
| 329 |
+
min_probability_mass = 0.0001
|
| 330 |
+
|
| 331 |
+
def verify(self):
|
| 332 |
+
super().verify()
|
| 333 |
+
if (
|
| 334 |
+
self.neg_class_name.lower() in self.pos_class_name.lower()
|
| 335 |
+
or self.pos_class_name.lower() in self.neg_class_name.lower()
|
| 336 |
+
):
|
| 337 |
+
raise ValueError(
|
| 338 |
+
f"""Class names in {self.__class__.__name__} should not overlap, got "{self.pos_class_name}" and "{self.neg_class_name}"""
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
def process_value(self, obj: Any) -> Any:
|
| 342 |
+
for i in self.get_token_range(obj):
|
| 343 |
+
try:
|
| 344 |
+
pos_probs, neg_probs = self.get_pos_neg_probs(pred_dict=obj[i])
|
| 345 |
+
if pos_probs or neg_probs:
|
| 346 |
+
sum_probs = sum(pos_probs) + sum(neg_probs)
|
| 347 |
+
if sum_probs > self.min_probability_mass:
|
| 348 |
+
return sum(pos_probs) / sum_probs
|
| 349 |
+
except:
|
| 350 |
+
pass
|
| 351 |
+
return 0
|
| 352 |
+
|
| 353 |
+
def get_pos_neg_probs(self, pred_dict):
|
| 354 |
+
token_logprobs = pred_dict["top_tokens"]
|
| 355 |
+
|
| 356 |
+
pos_and_neg_probs = []
|
| 357 |
+
for class_name in [self.pos_class_name, self.neg_class_name]:
|
| 358 |
+
# We need to capture different variants of model behavior and tokenizers, for example with opening space,
|
| 359 |
+
# punctuation etc. but avoid longer words that contain the class name.
|
| 360 |
+
# For example, for class "yes" we would capture "YES," and " Yes" but not "yesterday".
|
| 361 |
+
name_regex = re.compile(
|
| 362 |
+
rf"(\W|Ġ|_)*{class_name}(\W|Ġ|_)*", flags=re.IGNORECASE
|
| 363 |
+
)
|
| 364 |
+
class_probs = [
|
| 365 |
+
np.exp(d["logprob"])
|
| 366 |
+
for d in token_logprobs
|
| 367 |
+
if name_regex.fullmatch(d["text"])
|
| 368 |
+
]
|
| 369 |
+
pos_and_neg_probs.append(class_probs)
|
| 370 |
+
return pos_and_neg_probs
|
| 371 |
+
|
| 372 |
+
def get_token_range(self, obj: Any) -> range:
|
| 373 |
+
n_tokens = min([self.num_logprobs_to_take, len(obj)])
|
| 374 |
+
if self.take_logprobs_from_end:
|
| 375 |
+
return range(-1, -(n_tokens + 1), -1)
|
| 376 |
+
return range(n_tokens)
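A worked example of the conversion above, under the assumed top_tokens structure (a list of candidate tokens with "text" and "logprob" for one generated position); numbers are illustrative.

import math
import re

top_tokens = [
    {"text": " Yes", "logprob": -0.105},
    {"text": "No",   "logprob": -2.30},
    {"text": "yes,", "logprob": -4.61},
]

def class_probs(class_name):
    # Same matching idea as above: tolerate spaces/punctuation around the class name.
    pattern = re.compile(rf"(\W|Ġ|_)*{class_name}(\W|Ġ|_)*", flags=re.IGNORECASE)
    return [math.exp(d["logprob"]) for d in top_tokens if pattern.fullmatch(d["text"])]

pos, neg = class_probs("yes"), class_probs("no")
print(sum(pos) / (sum(pos) + sum(neg)))  # ~0.90: probability mass assigned to the positive class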
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
class RemoveArticles(FieldOperator):
|
| 380 |
+
def process_value(self, text: Any) -> Any:
|
| 381 |
+
return re.sub(r"\b(a|an|the)\b", " ", text)
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
class RemovePunctuations(FieldOperator):
|
| 385 |
+
def process_value(self, text: Any) -> Any:
|
| 386 |
+
puncs_to_exclude = set(string.punctuation)
|
| 387 |
+
return "".join(c for c in text if c not in puncs_to_exclude)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
class FixWhiteSpace(FieldOperator):
|
| 391 |
+
def process_value(self, text: Any) -> Any:
|
| 392 |
+
return " ".join(text.split())
|
settings_utils.py
CHANGED
|
@@ -147,6 +147,7 @@ if Settings.is_uninitilized():
     settings.skip_artifacts_prepare_and_verify = (bool, False)
     settings.data_classification_policy = None
     settings.mock_inference_mode = (bool, False)
+    settings.disable_hf_datasets_cache = (bool, True)
 
 if Constants.is_uninitilized():
     constants = Constants()
|
split_utils.py
CHANGED
|
@@ -226,7 +226,12 @@ def rename_split(input_streams: Dict[str, Stream], mapping: Dict[str, str]):
|
|
| 226 |
dict: A dictionary containing the generated new streams, where each key is the name
|
| 227 |
of the new stream and the value is a generator representing the stream.
|
| 228 |
"""
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
|
| 232 |
def random_mix_generator(
|
|
|
|
| 226 |
dict: A dictionary containing the generated new streams, where each key is the name
|
| 227 |
of the new stream and the value is a generator representing the stream.
|
| 228 |
"""
|
| 229 |
+
new_streams = {}
|
| 230 |
+
for key, val in mapping.items():
|
| 231 |
+
if key not in input_streams:
|
| 232 |
+
raise ValueError("Wrong stream name")
|
| 233 |
+
new_streams[val] = input_streams.pop(key)
|
| 234 |
+
return {**input_streams, **new_streams}
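For example (a sketch; the stream values here are plain lists rather than real Stream objects):

# rename_split moves a stream to a new key and leaves the other streams untouched.
input_streams = {"validation": ["v1", "v2"], "train": ["t1"]}
mapping = {"validation": "test"}

new_streams = {}
for key, val in mapping.items():
    if key not in input_streams:
        raise ValueError(f"Stream '{key}' not found in input streams")
    new_streams[val] = input_streams.pop(key)

print({**input_streams, **new_streams})  # {'train': ['t1'], 'test': ['v1', 'v2']}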
|
| 235 |
|
| 236 |
|
| 237 |
def random_mix_generator(
|
splitters.py
CHANGED
|
@@ -16,7 +16,7 @@ from .split_utils import (
|
|
| 16 |
)
|
| 17 |
from .stream import EmptyStreamError, FaultyStreamError, MultiStream
|
| 18 |
from .type_utils import isoftype
|
| 19 |
-
from .utils import
|
| 20 |
|
| 21 |
|
| 22 |
class Splitter(MultiStreamOperator):
|
|
@@ -353,7 +353,9 @@ class Sample(InstanceOperatorWithMultiStreamAccess):
|
|
| 353 |
sample_size = self.get_sample_size(instance)
|
| 354 |
try:
|
| 355 |
if self.local_cache is None:
|
| 356 |
-
self.local_cache =
|
|
|
|
|
|
|
| 357 |
|
| 358 |
source_stream = self.local_cache
|
| 359 |
source_stream = self.sampler.filter_source_by_instance(
|
|
|
|
| 16 |
)
|
| 17 |
from .stream import EmptyStreamError, FaultyStreamError, MultiStream
|
| 18 |
from .type_utils import isoftype
|
| 19 |
+
from .utils import recursive_shallow_copy
|
| 20 |
|
| 21 |
|
| 22 |
class Splitter(MultiStreamOperator):
|
|
|
|
| 353 |
sample_size = self.get_sample_size(instance)
|
| 354 |
try:
|
| 355 |
if self.local_cache is None:
|
| 356 |
+
self.local_cache = recursive_shallow_copy(
|
| 357 |
+
list(multi_stream[self.from_stream])
|
| 358 |
+
)
|
| 359 |
|
| 360 |
source_stream = self.local_cache
|
| 361 |
source_stream = self.sampler.filter_source_by_instance(
|
standard.py
CHANGED
|
@@ -249,12 +249,12 @@ class BaseRecipe(Recipe, SourceSequentialOperator):
|
|
| 249 |
def produce(self, task_instances):
|
| 250 |
"""Use the recipe in production to produce model ready query from standard task instance."""
|
| 251 |
self.before_process_multi_stream()
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
)
|
| 258 |
multi_stream = self.inference(multi_stream)
|
| 259 |
return list(multi_stream[constants.inference_stream])
|
| 260 |
|
|
|
|
| 249 |
def produce(self, task_instances):
|
| 250 |
"""Use the recipe in production to produce model ready query from standard task instance."""
|
| 251 |
self.before_process_multi_stream()
|
| 252 |
+
streams = {
|
| 253 |
+
constants.inference_stream: self.production_preprocess(task_instances),
|
| 254 |
+
}
|
| 255 |
+
if self.use_demos:
|
| 256 |
+
streams[self.demos_pool_name] = self.production_demos_pool()
|
| 257 |
+
multi_stream = MultiStream.from_iterables(streams)
|
| 258 |
multi_stream = self.inference(multi_stream)
|
| 259 |
return list(multi_stream[constants.inference_stream])
|
| 260 |
|
stream.py
CHANGED
|
@@ -10,7 +10,7 @@ from .dataclass import Dataclass, OptionalField
|
|
| 10 |
from .generator_utils import CopyingReusableGenerator, ReusableGenerator
|
| 11 |
from .logging_utils import get_logger
|
| 12 |
from .settings_utils import get_settings
|
| 13 |
-
from .utils import
|
| 14 |
|
| 15 |
settings = get_settings()
|
| 16 |
logger = get_logger()
|
|
@@ -40,7 +40,7 @@ class ListStream(Stream):
|
|
| 40 |
|
| 41 |
def __iter__(self):
|
| 42 |
if self.copying:
|
| 43 |
-
return iter(
|
| 44 |
return iter(self.instances_list)
|
| 45 |
|
| 46 |
def peek(self):
|
|
@@ -244,7 +244,8 @@ class MultiStream(dict):
|
|
| 244 |
return IterableDatasetDict(
|
| 245 |
{
|
| 246 |
key: IterableDataset.from_generator(
|
| 247 |
-
self.get_generator,
|
|
|
|
| 248 |
)
|
| 249 |
for key in self.keys()
|
| 250 |
}
|
|
|
|
| 10 |
from .generator_utils import CopyingReusableGenerator, ReusableGenerator
|
| 11 |
from .logging_utils import get_logger
|
| 12 |
from .settings_utils import get_settings
|
| 13 |
+
from .utils import recursive_copy
|
| 14 |
|
| 15 |
settings = get_settings()
|
| 16 |
logger = get_logger()
|
|
|
|
| 40 |
|
| 41 |
def __iter__(self):
|
| 42 |
if self.copying:
|
| 43 |
+
return iter(recursive_copy(self.instances_list))
|
| 44 |
return iter(self.instances_list)
|
| 45 |
|
| 46 |
def peek(self):
|
|
|
|
| 244 |
return IterableDatasetDict(
|
| 245 |
{
|
| 246 |
key: IterableDataset.from_generator(
|
| 247 |
+
self.get_generator,
|
| 248 |
+
gen_kwargs={"key": key},
|
| 249 |
)
|
| 250 |
for key in self.keys()
|
| 251 |
}
|
stream_operators.py
CHANGED
|
@@ -31,6 +31,7 @@ The rest of this section is dedicated for operators that operates on streams.
|
|
| 31 |
|
| 32 |
"""
|
| 33 |
|
|
|
|
| 34 |
from typing import (
|
| 35 |
List,
|
| 36 |
Literal,
|
|
@@ -154,6 +155,7 @@ class DuplicateSplit(MultiStreamOperator):
|
|
| 154 |
|
| 155 |
def process(self, multi_stream: MultiStream) -> MultiStream:
|
| 156 |
assert self.split in multi_stream
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
| 31 |
|
| 32 |
"""
|
| 33 |
|
| 34 |
+
import copy
|
| 35 |
from typing import (
|
| 36 |
List,
|
| 37 |
Literal,
|
|
|
|
| 155 |
|
| 156 |
def process(self, multi_stream: MultiStream) -> MultiStream:
|
| 157 |
assert self.split in multi_stream
|
| 158 |
+
new_stream = copy.deepcopy(multi_stream[self.split])
|
| 159 |
+
new_stream.set_copying(copying=True)
|
| 160 |
+
multi_stream[self.to_split] = new_stream
|
| 161 |
+
return multi_stream
|
string_operators.py
CHANGED
|
@@ -87,3 +87,12 @@ class Replace(FieldOperator):
|
|
| 87 |
|
| 88 |
def process_value(self, value: str) -> str:
|
| 89 |
return value.replace(self.old, self.new)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def process_value(self, value: str) -> str:
|
| 89 |
return value.replace(self.old, self.new)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class MapReplace(FieldOperator):
|
| 93 |
+
mapping: Dict[str, str]
|
| 94 |
+
|
| 95 |
+
def process_value(self, value: Any) -> Any:
|
| 96 |
+
for key, val in self.mapping.items():
|
| 97 |
+
value = value.replace(key, val)
|
| 98 |
+
return value
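Usage sketch: replacements are applied one after another in the mapping's insertion order, so later pairs see the output of earlier ones.

mapping = {"colour": "color", "centre": "center"}

value = "The colour chart is in the centre."
for key, val in mapping.items():
    value = value.replace(key, val)
print(value)  # "The color chart is in the center."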
|
struct_data_operators.py
CHANGED
|
@@ -32,7 +32,7 @@ from .operators import FieldOperator, InstanceOperator
|
|
| 32 |
from .random_utils import new_random_generator
|
| 33 |
from .serializers import TableSerializer
|
| 34 |
from .types import Table
|
| 35 |
-
from .utils import
|
| 36 |
|
| 37 |
|
| 38 |
def shuffle_columns(table: Table, seed=0) -> Table:
|
|
@@ -76,7 +76,7 @@ class SerializeTable(ABC, TableSerializer):
|
|
| 76 |
shuffle_columns: bool = False
|
| 77 |
|
| 78 |
def serialize(self, value: Table, instance: Dict[str, Any]) -> str:
|
| 79 |
-
value =
|
| 80 |
if self.shuffle_columns:
|
| 81 |
value = shuffle_columns(table=value, seed=self.seed)
|
| 82 |
|
|
@@ -207,6 +207,12 @@ class SerializeTableAsDFLoader(SerializeTable):
|
|
| 207 |
|
| 208 |
assert header and rows, "Incorrect input table format"
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# Create a pandas DataFrame
|
| 211 |
df = pd.DataFrame(rows, columns=header)
|
| 212 |
|
|
@@ -252,6 +258,59 @@ class SerializeTableAsJson(SerializeTable):
|
|
| 252 |
return json.dumps(output_dict)
|
| 253 |
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
# truncate cell value to maximum allowed length
|
| 256 |
def truncate_cell(cell_value, max_len):
|
| 257 |
if cell_value is None:
|
|
@@ -490,7 +549,7 @@ class ConvertTableColNamesToSequential(FieldOperator):
|
|
| 490 |
"""
|
| 491 |
|
| 492 |
def process_value(self, table: Any) -> Any:
|
| 493 |
-
table_input =
|
| 494 |
return self.replace_header(table_content=table_input)
|
| 495 |
|
| 496 |
# replaces header with sequential column names
|
|
@@ -523,7 +582,7 @@ class ShuffleTableRows(FieldOperator):
|
|
| 523 |
"""
|
| 524 |
|
| 525 |
def process_value(self, table: Any) -> Any:
|
| 526 |
-
table_input =
|
| 527 |
return shuffle_rows(table_input)
|
| 528 |
|
| 529 |
|
|
@@ -544,7 +603,7 @@ class ShuffleTableColumns(FieldOperator):
|
|
| 544 |
"""
|
| 545 |
|
| 546 |
def process_value(self, table: Any) -> Any:
|
| 547 |
-
table_input =
|
| 548 |
return shuffle_columns(table_input)
|
| 549 |
|
| 550 |
|
|
@@ -658,3 +717,133 @@ class ConstructTableFromRowsCols(InstanceOperator):
|
|
| 658 |
instance[self.to_field] = output_dict
|
| 659 |
|
| 660 |
return instance
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
from .random_utils import new_random_generator
|
| 33 |
from .serializers import TableSerializer
|
| 34 |
from .types import Table
|
| 35 |
+
from .utils import recursive_copy
|
| 36 |
|
| 37 |
|
| 38 |
def shuffle_columns(table: Table, seed=0) -> Table:
|
|
|
|
| 76 |
shuffle_columns: bool = False
|
| 77 |
|
| 78 |
def serialize(self, value: Table, instance: Dict[str, Any]) -> str:
|
| 79 |
+
value = recursive_copy(value)
|
| 80 |
if self.shuffle_columns:
|
| 81 |
value = shuffle_columns(table=value, seed=self.seed)
|
| 82 |
|
|
|
|
| 207 |
|
| 208 |
assert header and rows, "Incorrect input table format"
|
| 209 |
|
| 210 |
+
# Fix duplicate columns, ensuring the first occurrence has no suffix
|
| 211 |
+
header = [
|
| 212 |
+
f"{col}_{header[:i].count(col)}" if header[:i].count(col) > 0 else col
|
| 213 |
+
for i, col in enumerate(header)
|
| 214 |
+
]
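For instance (illustration only), a header with repeated names is disambiguated before the DataFrame is built, keeping the first occurrence unsuffixed:

header = ["name", "score", "name", "score"]
header = [
    f"{col}_{header[:i].count(col)}" if header[:i].count(col) > 0 else col
    for i, col in enumerate(header)
]
print(header)  # ['name', 'score', 'name_1', 'score_1']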
|
| 215 |
+
|
| 216 |
# Create a pandas DataFrame
|
| 217 |
df = pd.DataFrame(rows, columns=header)
|
| 218 |
|
|
|
|
| 258 |
return json.dumps(output_dict)
|
| 259 |
|
| 260 |
|
| 261 |
+
class SerializeTableAsHTML(SerializeTable):
|
| 262 |
+
"""HTML Table Serializer.
|
| 263 |
+
|
| 264 |
+
HTML table format used for rendering tables in web pages.
|
| 265 |
+
Format(Sample):
|
| 266 |
+
<table>
|
| 267 |
+
<thead>
|
| 268 |
+
<tr><th>name</th><th>age</th><th>sex</th></tr>
|
| 269 |
+
</thead>
|
| 270 |
+
<tbody>
|
| 271 |
+
<tr><td>Alice</td><td>26</td><td>F</td></tr>
|
| 272 |
+
<tr><td>Raj</td><td>34</td><td>M</td></tr>
|
| 273 |
+
</tbody>
|
| 274 |
+
</table>
|
| 275 |
+
"""
|
| 276 |
+
|
| 277 |
+
# main method that serializes a table.
|
| 278 |
+
# table_content must be in the prescribed input format.
|
| 279 |
+
def serialize_table(self, table_content: Dict) -> str:
|
| 280 |
+
# Extract headers and rows from the dictionary
|
| 281 |
+
header = table_content.get("header", [])
|
| 282 |
+
rows = table_content.get("rows", [])
|
| 283 |
+
|
| 284 |
+
assert header and rows, "Incorrect input table format"
|
| 285 |
+
|
| 286 |
+
# Build the HTML table structure
|
| 287 |
+
serialized_tbl_str = "<table>\n"
|
| 288 |
+
serialized_tbl_str += self.process_header(header) + "\n"
|
| 289 |
+
serialized_tbl_str += self.process_rows(rows) + "\n"
|
| 290 |
+
serialized_tbl_str += "</table>"
|
| 291 |
+
|
| 292 |
+
return serialized_tbl_str.strip()
|
| 293 |
+
|
| 294 |
+
# serialize the header into an HTML <thead> section
|
| 295 |
+
def process_header(self, header: List) -> str:
|
| 296 |
+
header_html = " <thead>\n <tr>"
|
| 297 |
+
for col in header:
|
| 298 |
+
header_html += f"<th>{col}</th>"
|
| 299 |
+
header_html += "</tr>\n </thead>"
|
| 300 |
+
return header_html
|
| 301 |
+
|
| 302 |
+
# serialize the rows into an HTML <tbody> section
|
| 303 |
+
def process_rows(self, rows: List[List]) -> str:
|
| 304 |
+
rows_html = " <tbody>"
|
| 305 |
+
for row in rows:
|
| 306 |
+
rows_html += "\n <tr>"
|
| 307 |
+
for cell in row:
|
| 308 |
+
rows_html += f"<td>{cell}</td>"
|
| 309 |
+
rows_html += "</tr>"
|
| 310 |
+
rows_html += "\n </tbody>"
|
| 311 |
+
return rows_html
|
| 312 |
+
|
| 313 |
+
|
| 314 |
# truncate cell value to maximum allowed length
|
| 315 |
def truncate_cell(cell_value, max_len):
|
| 316 |
if cell_value is None:
|
|
|
|
| 549 |
"""
|
| 550 |
|
| 551 |
def process_value(self, table: Any) -> Any:
|
| 552 |
+
table_input = recursive_copy(table)
|
| 553 |
return self.replace_header(table_content=table_input)
|
| 554 |
|
| 555 |
# replaces header with sequential column names
|
|
|
|
| 582 |
"""
|
| 583 |
|
| 584 |
def process_value(self, table: Any) -> Any:
|
| 585 |
+
table_input = recursive_copy(table)
|
| 586 |
return shuffle_rows(table_input)
|
| 587 |
|
| 588 |
|
|
|
|
| 603 |
"""
|
| 604 |
|
| 605 |
def process_value(self, table: Any) -> Any:
|
| 606 |
+
table_input = recursive_copy(table)
|
| 607 |
return shuffle_columns(table_input)
|
| 608 |
|
| 609 |
|
|
|
|
| 717 |
instance[self.to_field] = output_dict
|
| 718 |
|
| 719 |
return instance
|
| 720 |
+
|
| 721 |
+
|
| 722 |
+
class TransposeTable(FieldOperator):
|
| 723 |
+
"""Transpose a table.
|
| 724 |
+
|
| 725 |
+
Sample Input:
|
| 726 |
+
{
|
| 727 |
+
"header": ["name", "age", "sex"],
|
| 728 |
+
"rows": [["Alice", 26, "F"], ["Raj", 34, "M"], ["Donald", 39, "M"]],
|
| 729 |
+
}
|
| 730 |
+
|
| 731 |
+
Sample Output:
|
| 732 |
+
{
|
| 733 |
+
"header": [" ", "0", "1", "2"],
|
| 734 |
+
"rows": [["name", "Alice", "Raj", "Donald"], ["age", 26, 34, 39], ["sex", "F", "M", "M"]],
|
| 735 |
+
}
|
| 736 |
+
"""
|
| 737 |
+
|
| 738 |
+
def process_value(self, table: Any) -> Any:
|
| 739 |
+
return self.transpose_table(table)
|
| 740 |
+
|
| 741 |
+
def transpose_table(self, table: Dict) -> Dict:
|
| 742 |
+
# Extract the header and rows from the table object
|
| 743 |
+
header = table["header"]
|
| 744 |
+
rows = table["rows"]
|
| 745 |
+
|
| 746 |
+
# Transpose the table by converting rows as columns and vice versa
|
| 747 |
+
transposed_header = [" "] + [str(i) for i in range(len(rows))]
|
| 748 |
+
transposed_rows = [
|
| 749 |
+
[header[i]] + [row[i] for row in rows] for i in range(len(header))
|
| 750 |
+
]
|
| 751 |
+
|
| 752 |
+
return {"header": transposed_header, "rows": transposed_rows}
|
| 753 |
+
|
| 754 |
+
|
| 755 |
+
class DuplicateTableRows(FieldOperator):
|
| 756 |
+
"""Duplicates specific rows of a table for the given number of times.
|
| 757 |
+
|
| 758 |
+
Args:
|
| 759 |
+
row_indices (List[int]) - rows to be duplicated
|
| 760 |
+
times(int) - how many times to duplicate
|
| 761 |
+
"""
|
| 762 |
+
|
| 763 |
+
row_indices: List[int] = []
|
| 764 |
+
times: int = 1
|
| 765 |
+
|
| 766 |
+
def process_value(self, table: Any) -> Any:
|
| 767 |
+
# Extract the header and rows from the table
|
| 768 |
+
header = table["header"]
|
| 769 |
+
rows = table["rows"]
|
| 770 |
+
|
| 771 |
+
# Duplicate only the specified rows
|
| 772 |
+
duplicated_rows = []
|
| 773 |
+
for i, row in enumerate(rows):
|
| 774 |
+
if i in self.row_indices:
|
| 775 |
+
duplicated_rows.extend(
|
| 776 |
+
[row] * self.times
|
| 777 |
+
) # Duplicate the selected rows
|
| 778 |
+
else:
|
| 779 |
+
duplicated_rows.append(row) # Leave other rows unchanged
|
| 780 |
+
|
| 781 |
+
# Return the new table with selectively duplicated rows
|
| 782 |
+
return {"header": header, "rows": duplicated_rows}
|
| 783 |
+
|
| 784 |
+
|
| 785 |
+
class DuplicateTableColumns(FieldOperator):
|
| 786 |
+
"""Duplicates specific columns of a table for the given number of times.
|
| 787 |
+
|
| 788 |
+
Args:
|
| 789 |
+
column_indices (List[int]) - columns to be duplicated
|
| 790 |
+
times(int) - how many times to duplicate
|
| 791 |
+
"""
|
| 792 |
+
|
| 793 |
+
column_indices: List[int] = []
|
| 794 |
+
times: int = 1
|
| 795 |
+
|
| 796 |
+
def process_value(self, table: Any) -> Any:
|
| 797 |
+
# Extract the header and rows from the table
|
| 798 |
+
header = table["header"]
|
| 799 |
+
rows = table["rows"]
|
| 800 |
+
|
| 801 |
+
# Duplicate the specified columns in the header
|
| 802 |
+
duplicated_header = []
|
| 803 |
+
for i, col in enumerate(header):
|
| 804 |
+
if i in self.column_indices:
|
| 805 |
+
duplicated_header.extend([col] * self.times)
|
| 806 |
+
else:
|
| 807 |
+
duplicated_header.append(col)
|
| 808 |
+
|
| 809 |
+
# Duplicate the specified columns in each row
|
| 810 |
+
duplicated_rows = []
|
| 811 |
+
for row in rows:
|
| 812 |
+
new_row = []
|
| 813 |
+
for i, value in enumerate(row):
|
| 814 |
+
if i in self.column_indices:
|
| 815 |
+
new_row.extend([value] * self.times)
|
| 816 |
+
else:
|
| 817 |
+
new_row.append(value)
|
| 818 |
+
duplicated_rows.append(new_row)
|
| 819 |
+
|
| 820 |
+
# Return the new table with selectively duplicated columns
|
| 821 |
+
return {"header": duplicated_header, "rows": duplicated_rows}
|
| 822 |
+
|
| 823 |
+
|
| 824 |
+
class InsertEmptyTableRows(FieldOperator):
|
| 825 |
+
"""Inserts empty rows in a table randomly for the given number of times.
|
| 826 |
+
|
| 827 |
+
Args:
|
| 828 |
+
times(int) - how many times to insert
|
| 829 |
+
"""
|
| 830 |
+
|
| 831 |
+
times: int = 0
|
| 832 |
+
|
| 833 |
+
def process_value(self, table: Any) -> Any:
|
| 834 |
+
# Extract the header and rows from the table
|
| 835 |
+
header = table["header"]
|
| 836 |
+
rows = table["rows"]
|
| 837 |
+
|
| 838 |
+
# Insert empty rows at random positions
|
| 839 |
+
for _ in range(self.times):
|
| 840 |
+
empty_row = [""] * len(
|
| 841 |
+
header
|
| 842 |
+
) # Create an empty row with the same number of columns
|
| 843 |
+
insert_pos = random.randint(
|
| 844 |
+
0, len(rows)
|
| 845 |
+
) # Get a random position to insert the empty row created
|
| 846 |
+
rows.insert(insert_pos, empty_row)
|
| 847 |
+
|
| 848 |
+
# Return the modified table
|
| 849 |
+
return {"header": header, "rows": rows}
|
templates.py
CHANGED
|
@@ -210,7 +210,7 @@ class ApplyTemplate(InstanceOperator):
         if self.demos_field not in instance:
             raise ValueError("Demos field is missing.")
         instance[self.demos_field] = [
-            self.apply(template, demo_instance
+            self.apply(template, demo_instance)
             for demo_instance in instance[self.demos_field]
         ]
         dict_set(instance, "recipe_metadata/template", template)
|
type_utils.py
CHANGED
|
@@ -4,6 +4,7 @@ import io
|
|
| 4 |
import itertools
|
| 5 |
import re
|
| 6 |
import typing
|
|
|
|
| 7 |
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
|
| 8 |
|
| 9 |
from .utils import safe_eval
|
|
@@ -810,6 +811,7 @@ class NormalizedType(typing.NamedTuple):
|
|
| 810 |
return f"{self.origin}[{self.args}])"
|
| 811 |
|
| 812 |
|
|
|
|
| 813 |
def _normalize_args(tps: TypeArgs):
|
| 814 |
if isinstance(tps, str):
|
| 815 |
return tps
|
|
@@ -918,6 +920,7 @@ def _is_origin_subtype_args(
|
|
| 918 |
return _is_normal_subtype(left, right, forward_refs)
|
| 919 |
|
| 920 |
|
|
|
|
| 921 |
def _is_normal_subtype(
|
| 922 |
left: NormalizedType,
|
| 923 |
right: NormalizedType,
|
|
|
|
| 4 |
import itertools
|
| 5 |
import re
|
| 6 |
import typing
|
| 7 |
+
from functools import lru_cache
|
| 8 |
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
|
| 9 |
|
| 10 |
from .utils import safe_eval
|
|
|
|
| 811 |
return f"{self.origin}[{self.args}])"
|
| 812 |
|
| 813 |
|
| 814 |
+
@lru_cache(maxsize=None)
|
| 815 |
def _normalize_args(tps: TypeArgs):
|
| 816 |
if isinstance(tps, str):
|
| 817 |
return tps
|
|
|
|
| 920 |
return _is_normal_subtype(left, right, forward_refs)
|
| 921 |
|
| 922 |
|
| 923 |
+
@lru_cache(maxsize=None)
|
| 924 |
def _is_normal_subtype(
|
| 925 |
left: NormalizedType,
|
| 926 |
right: NormalizedType,
|
utils.py
CHANGED
|
@@ -148,5 +148,88 @@ def import_module_from_file(file_path):
|
|
| 148 |
return module
|
| 149 |
|
| 150 |
|
| 151 |
-
def
|
|
|
|
|
|
|
| 152 |
return copy.deepcopy(obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
return module
|
| 149 |
|
| 150 |
|
| 151 |
+
def deep_copy(obj):
|
| 152 |
+
"""Creates a deep copy of the given object.
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
obj: The object to be deep copied.
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
A deep copy of the original object.
|
| 159 |
+
"""
|
| 160 |
return copy.deepcopy(obj)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def shallow_copy(obj):
|
| 164 |
+
"""Creates a shallow copy of the given object.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
obj: The object to be shallow copied.
|
| 168 |
+
|
| 169 |
+
Returns:
|
| 170 |
+
A shallow copy of the original object.
|
| 171 |
+
"""
|
| 172 |
+
return copy.copy(obj)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def recursive_copy(obj, internal_copy=None):
|
| 176 |
+
"""Recursively copies an object with a selective copy method.
|
| 177 |
+
|
| 178 |
+
For `list`, `dict`, and `tuple` types, it recursively copies their contents.
|
| 179 |
+
For other types, it uses the provided `internal_copy` function if available.
|
| 180 |
+
Objects without a `copy` method are returned as is.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
obj: The object to be copied.
|
| 184 |
+
internal_copy (callable, optional): The copy function to use for non-container objects.
|
| 185 |
+
If `None`, objects without a `copy` method are returned as is.
|
| 186 |
+
|
| 187 |
+
Returns:
|
| 188 |
+
The recursively copied object.
|
| 189 |
+
"""
|
| 190 |
+
# Handle dictionaries
|
| 191 |
+
if isinstance(obj, dict):
|
| 192 |
+
return type(obj)(
|
| 193 |
+
{key: recursive_copy(value, internal_copy) for key, value in obj.items()}
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
# Handle named tuples
|
| 197 |
+
if isinstance(obj, tuple) and hasattr(obj, "_fields"):
|
| 198 |
+
return type(obj)(*(recursive_copy(item, internal_copy) for item in obj))
|
| 199 |
+
|
| 200 |
+
# Handle tuples and lists
|
| 201 |
+
if isinstance(obj, (tuple, list)):
|
| 202 |
+
return type(obj)(recursive_copy(item, internal_copy) for item in obj)
|
| 203 |
+
|
| 204 |
+
if internal_copy is None:
|
| 205 |
+
return obj
|
| 206 |
+
|
| 207 |
+
return internal_copy(obj)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def recursive_deep_copy(obj):
|
| 211 |
+
"""Performs a recursive deep copy of the given object.
|
| 212 |
+
|
| 213 |
+
This function uses `deep_copy` as the internal copy method for non-container objects.
|
| 214 |
+
|
| 215 |
+
Args:
|
| 216 |
+
obj: The object to be deep copied.
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
A recursively deep-copied version of the original object.
|
| 220 |
+
"""
|
| 221 |
+
return recursive_copy(obj, deep_copy)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def recursive_shallow_copy(obj):
|
| 225 |
+
"""Performs a recursive shallow copy of the given object.
|
| 226 |
+
|
| 227 |
+
This function uses `shallow_copy` as the internal copy method for non-container objects.
|
| 228 |
+
|
| 229 |
+
Args:
|
| 230 |
+
obj: The object to be shallow copied.
|
| 231 |
+
|
| 232 |
+
Returns:
|
| 233 |
+
A recursively shallow-copied version of the original object.
|
| 234 |
+
"""
|
| 235 |
+
return recursive_copy(obj, shallow_copy)
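A quick comparison of the helpers above, as a self-contained sketch (the recursive_copy body is restated here, minus the namedtuple case, so the snippet runs on its own):

import copy

def recursive_copy(obj, internal_copy=None):
    if isinstance(obj, dict):
        return type(obj)({k: recursive_copy(v, internal_copy) for k, v in obj.items()})
    if isinstance(obj, (tuple, list)):
        return type(obj)(recursive_copy(item, internal_copy) for item in obj)
    return obj if internal_copy is None else internal_copy(obj)

nested = {"rows": [[1, 2], [3, 4]]}
frozen = recursive_copy(nested, copy.copy)   # recursive_shallow_copy: containers rebuilt
alias = copy.copy(nested)                    # shallow_copy: inner lists still shared

nested["rows"][0][0] = 99
print(frozen["rows"][0][0])  # 1  - edits to the original do not leak in
print(alias["rows"][0][0])   # 99 - only the outer dict was copied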
|
version.py
CHANGED
|
@@ -1 +1 @@
-version = "1.
+version = "1.14.0"