Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron commited on Jun 25

Commit

39b18be

verified ·

1 Parent(s): 99f75f9

Upload folder using huggingface_hub

Browse files

Files changed (30) hide show

api.py +49 -5
artifact.py +28 -18
collections_operators.py +60 -2
dataclass.py +59 -0
dataset.py +1 -1
dialog_operators.py +10 -1
dict_utils.py +1 -1
error_utils.py +254 -12
evaluate_cli.py +56 -3
formats.py +53 -9
fusion.py +14 -16
inference.py +229 -126
llm_as_judge_constants.py +1 -2
loaders.py +107 -81
metric.py +1 -1
metric_utils.py +19 -12
metrics.py +548 -654
operator.py +23 -13
operators.py +79 -58
processors.py +11 -1
schema.py +1 -1
serializers.py +18 -2
settings_utils.py +4 -0
struct_data_operators.py +49 -0
task.py +13 -8
templates.py +13 -1
sql_utils.py → text2sql_utils.py +488 -2
type_utils.py +18 -2
types.py +56 -26
version.py +1 -1

api.py CHANGED Viewed

@@ -11,6 +11,7 @@ from datasets.exceptions import DatasetGenerationError
 from .artifact import fetch_artifact
 from .benchmark import Benchmark
 from .card import TaskCard
 from .dataset_utils import get_dataset_artifact
 from .error_utils import UnitxtError
 from .inference import (
@@ -149,6 +150,36 @@ def create_dataset(
     return load_dataset(card=card, split=split, **kwargs)
 def _source_to_dataset(
     source: SourceOperator,
     split=None,
@@ -157,22 +188,35 @@ def _source_to_dataset(
 ):
     from .dataset import Dataset as UnitxtDataset
     stream = source()
     try:
         ds_builder = UnitxtDataset(
             dataset_name="unitxt",
-            config_name="recipe-" + short_hex_hash(repr(source)),
             version=constants.version,
         )
         if split is not None:
             stream = {split: stream[split]}
         ds_builder._generators = stream
-        ds_builder.download_and_prepare(
-            verification_mode="no_checks",
-            download_mode=None if use_cache else "force_redownload",
-        )
         if streaming:
             return ds_builder.as_streaming_dataset(split=split)

 from .artifact import fetch_artifact
 from .benchmark import Benchmark
 from .card import TaskCard
+from .dataclass import to_dict
 from .dataset_utils import get_dataset_artifact
 from .error_utils import UnitxtError
 from .inference import (
     return load_dataset(card=card, split=split, **kwargs)
+def object_to_str_without_addresses(obj):
+    """Return ``str(obj)`` with every memory-address marker removed.
+    Default object representations embed the object's address
+    (e.g. `<object_name at 0x123abc>`), which varies between executions. Stripping it
+    gives a representation that is stable across runs and therefore usable as part of
+    a reproducible signature.
+    Every `` at 0x<hex>`` marker is removed, not only the first one, and the text that
+    follows an address is kept. Truncating at the first address would make objects whose
+    representations differ only after it (for example two `functools.partial` objects
+    with different bound arguments) indistinguishable.
+    Args:
+        obj: Any Python object to be converted to a string representation.
+    Returns:
+        str: ``str(obj)`` with all memory addresses removed; unchanged if it contains none.
+    Example:
+        ```python
+        class MyClass:
+            pass
+        obj = MyClass()
+        print(str(obj))  # "<__main__.MyClass object at 0x7f8b9d4d6e20>"
+        print(object_to_str_without_addresses(obj))  # "<__main__.MyClass object>"
+        ```
+    """
+    # Imported locally so the function does not depend on the module's import block.
+    import re
+    return re.sub(r" at 0x[0-9a-fA-F]+", "", str(obj))
 def _source_to_dataset(
     source: SourceOperator,
     split=None,
 ):
     from .dataset import Dataset as UnitxtDataset
+    # Generate a unique signature for the source
+    source_signature = json.dumps(
+        to_dict(source, object_to_str_without_addresses), sort_keys=True
+    )
+    config_name = "recipe-" + short_hex_hash(source_signature)
+    # Obtain data stream from the source
     stream = source()
     try:
         ds_builder = UnitxtDataset(
             dataset_name="unitxt",
+            config_name=config_name,  # Dictate the cache name
             version=constants.version,
         )
         if split is not None:
             stream = {split: stream[split]}
         ds_builder._generators = stream
+        try:
+            ds_builder.download_and_prepare(
+                verification_mode="no_checks",
+                download_mode=None if use_cache else "force_redownload",
+            )
+        except DatasetGenerationError as e:
+            if e.__cause__:
+                raise e.__cause__ from None
+            if e.__context__:
+                raise e.__context__ from None
+            raise
         if streaming:
             return ds_builder.as_streaming_dataset(split=split)

artifact.py CHANGED Viewed

@@ -16,13 +16,13 @@ from .dataclass import (
     NonPositionalField,
     fields,
 )
-from .error_utils import Documentation, UnitxtError, UnitxtWarning
 from .logging_utils import get_logger
 from .parsing_utils import (
     separate_inside_and_outside_square_brackets,
 )
 from .settings_utils import get_constants, get_settings
-from .text_utils import camel_to_snake_case, is_camel_case, print_dict_as_yaml
 from .type_utils import isoftype, issubtype
 from .utils import (
     artifacts_json_cache,
@@ -342,8 +342,10 @@ class Artifact(Dataclass):
         self.verify_data_classification_policy()
         self.prepare_args()
         if not settings.skip_artifacts_prepare_and_verify:
-            self.prepare()
-            self.verify()
     def _to_raw_dict(self):
         return {
@@ -367,11 +369,14 @@ class Artifact(Dataclass):
     def to_json(self):
         data = self.to_dict()
         return json_dump(data)
     def to_yaml(self):
         data = self.to_dict()
-        return print_dict_as_yaml(data)
     def serialize(self):
         if self.__id__ is not None:
@@ -449,20 +454,25 @@ class Artifact(Dataclass):
             )
             return instance
-        if not any(
-            data_classification in data_classification_policy
-            for data_classification in instance_data_classification
         ):
-            raise UnitxtError(
-                f"The instance '{instance} 'has the following data classification policy "
-                f"'{instance_data_classification}', however, the artifact '{name}' "
-                f"is only configured to support the data with classification "
-                f"'{data_classification_policy}'. To enable this either change "
-                f"the 'data_classification_policy' attribute of the artifact, "
-                f"or modify the environment variable "
-                f"'UNITXT_DATA_CLASSIFICATION_POLICY' accordingly.",
-                Documentation.DATA_CLASSIFICATION_POLICY,
-            )
         return instance

     NonPositionalField,
     fields,
 )
+from .error_utils import Documentation, UnitxtError, UnitxtWarning, error_context
 from .logging_utils import get_logger
 from .parsing_utils import (
     separate_inside_and_outside_square_brackets,
 )
 from .settings_utils import get_constants, get_settings
+from .text_utils import camel_to_snake_case, is_camel_case
 from .type_utils import isoftype, issubtype
 from .utils import (
     artifacts_json_cache,
         self.verify_data_classification_policy()
         self.prepare_args()
         if not settings.skip_artifacts_prepare_and_verify:
+            with error_context(self, action="Prepare Object"):
+                self.prepare()
+            with error_context(self, action="Verify Object"):
+                self.verify()
     def _to_raw_dict(self):
         return {
     def to_json(self):
         data = self.to_dict()
         return json_dump(data)
     def to_yaml(self):
+        import yaml
         data = self.to_dict()
+        return yaml.dump(data)
     def serialize(self):
         if self.__id__ is not None:
             )
             return instance
+        with error_context(
+            self,
+            action="Sensitive Data Verification",
+            help="https://www.unitxt.ai/en/latest/docs/data_classification_policy.html",
         ):
+            if not any(
+                data_classification in data_classification_policy
+                for data_classification in instance_data_classification
+            ):
+                raise UnitxtError(
+                    f"The instance '{instance} 'has the following data classification policy "
+                    f"'{instance_data_classification}', however, the artifact '{name}' "
+                    f"is only configured to support the data with classification "
+                    f"'{data_classification_policy}'. To enable this either change "
+                    f"the 'data_classification_policy' attribute of the artifact, "
+                    f"or modify the environment variable "
+                    f"'UNITXT_DATA_CLASSIFICATION_POLICY' accordingly.",
+                    Documentation.DATA_CLASSIFICATION_POLICY,
+                )
         return instance

collections_operators.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from typing import Any, Dict, Generator, List, Optional
 from .dict_utils import dict_get, dict_set
 from .operators import FieldOperator, StreamOperator
 from .stream import Stream
 from .utils import recursive_shallow_copy
@@ -13,11 +15,52 @@ class Dictify(FieldOperator):
         return dict(zip(self.with_keys, tup))
 class DictToTuplesList(FieldOperator):
     def process_value(self, dic: Dict) -> Any:
         return list(dic.items())
 class Wrap(FieldOperator):
     inside: str
@@ -64,6 +107,13 @@ class Get(FieldOperator):
         return collection[self.item]
 class DuplicateByList(StreamOperator):
     field: str
     to_field: Optional[str] = None
@@ -91,12 +141,16 @@ class DuplicateBySubLists(StreamOperator):
     field: str
     to_field: Optional[str] = None
     use_deep_copy: bool = False
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         to_field = self.field if self.to_field is None else self.to_field
         for instance in stream:
-            elements = instance[self.field]
-            for i in range(1, len(elements) + 1):
                 if self.use_deep_copy:
                     instance_copy = recursive_shallow_copy(instance)
                     instance_copy[to_field] = elements[:i]
@@ -109,6 +163,10 @@ class DuplicateBySubLists(StreamOperator):
                 yield instance_copy
 class GetLength(FieldOperator):
     def process_value(self, collection: Any) -> Any:
         return len(collection)

+from itertools import zip_longest
 from typing import Any, Dict, Generator, List, Optional
 from .dict_utils import dict_get, dict_set
+from .operator import InstanceOperator
 from .operators import FieldOperator, StreamOperator
 from .stream import Stream
 from .utils import recursive_shallow_copy
         return dict(zip(self.with_keys, tup))
+class Zip(InstanceOperator):
+    fields: List[str]
+    to_field: str
+    def zip(self, values):
+        return list(zip(*values))
+    def process(
+        self, instance: Dict[str, Any], stream_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        values = []
+        for field in self.fields:
+            values.append(dict_get(instance, field))
+        dict_set(instance, self.to_field, self.zip(values))
+        return instance
+class ZipLongest(Zip):
+    fields: List[str]
+    fill_value: Any = None
+    def zip(self, values):
+        return list(zip_longest(*values, fillvalue=self.fill_value))
 class DictToTuplesList(FieldOperator):
     def process_value(self, dic: Dict) -> Any:
         return list(dic.items())
+def flatten(container):
+    """Return a copy of `container` with all nested lists and tuples expanded in place.
+    Elements that are lists or tuples are expanded recursively, in order; every other
+    element is kept as is. The result is built by calling `type(container)` on the
+    flattened elements, so the outer container type is preserved.
+    """
+    def _leaves(nested):
+        for element in nested:
+            if not isinstance(element, (list, tuple)):
+                yield element
+                continue
+            # Nested list/tuple: descend and emit its leaves in order
+            yield from _leaves(element)
+    rebuild = type(container)
+    return rebuild(_leaves(container))
+class Flatten(FieldOperator):
+    def process_value(self, value: Any) -> Any:
+        return flatten(value)
 class Wrap(FieldOperator):
     inside: str
         return collection[self.item]
+class Pop(FieldOperator):
+    item: Any = None
+    def process_value(self, collection: Any) -> Any:
+        return collection.pop(self.item)
 class DuplicateByList(StreamOperator):
     field: str
     to_field: Optional[str] = None
     field: str
     to_field: Optional[str] = None
     use_deep_copy: bool = False
+    start: int = 1
+    end: int = 0
+    step: int = 1
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         to_field = self.field if self.to_field is None else self.to_field
         for instance in stream:
+            elements = dict_get(instance, self.field)
+            end = len(elements) + 1 + self.end
+            for i in range(self.start, end, self.step):
                 if self.use_deep_copy:
                     instance_copy = recursive_shallow_copy(instance)
                     instance_copy[to_field] = elements[:i]
                 yield instance_copy
+class ExplodeSubLists(DuplicateBySubLists):
+    pass
 class GetLength(FieldOperator):
     def process_value(self, collection: Any) -> Any:
         return len(collection)

dataclass.py CHANGED Viewed

@@ -297,6 +297,65 @@ def _asdict_inner(obj):
     return copy.deepcopy(obj)
 class DataclassMeta(ABCMeta):
     """Metaclass for Dataclass.

     return copy.deepcopy(obj)
+def to_dict(obj, func=copy.deepcopy, _visited=None):
+    """Recursively converts an object into a dictionary representation while avoiding infinite recursion due to circular references.
+    Args:
+        obj: Any Python object to be converted into a dictionary-like structure.
+        func (Callable, optional): A function applied to non-container objects and to objects
+            at which a circular reference is detected. Defaults to `copy.deepcopy`.
+        _visited (set, optional): Internal. Ids of the containers currently being converted,
+            i.e. the ancestors of `obj` in the ongoing traversal. Callers should not pass it.
+    Returns:
+        dict: A dictionary representation of the input object, with supported collections and dataclasses
+        recursively processed.
+    Notes:
+        - Supports dataclasses, named tuples, lists, tuples, and dictionaries.
+        - Circular references are detected using object IDs and replaced by `func(obj)`.
+        - An object that is merely referenced more than once (without being its own ancestor)
+          is not a circular reference and is converted in full at every occurrence.
+        - Named tuples retain their original type instead of being converted to dictionaries.
+    """
+    # Initialize the set of ancestor ids on the outermost call
+    if _visited is None:
+        _visited = set()
+    obj_id = id(obj)
+    # obj is one of its own ancestors: a genuine cycle, so stop descending here
+    if obj_id in _visited:
+        return func(obj)
+    obj_is_dataclass = is_dataclass(obj)
+    is_named_tuple = isinstance(obj, tuple) and hasattr(obj, "_fields")
+    # Containers that are recorded as ancestors while their children are converted
+    tracked = obj_is_dataclass or is_named_tuple or isinstance(obj, (dict, list))
+    if tracked:
+        _visited.add(obj_id)
+    try:
+        if obj_is_dataclass:
+            return {
+                field.name: to_dict(getattr(obj, field.name), func, _visited)
+                for field in fields(obj)
+            }
+        if is_named_tuple:
+            return type(obj)(*[to_dict(v, func, _visited) for v in obj])
+        if isinstance(obj, (list, tuple)):
+            return type(obj)([to_dict(v, func, _visited) for v in obj])
+        if isinstance(obj, dict):
+            return type(obj)(
+                {
+                    to_dict(k, func, _visited): to_dict(v, func, _visited)
+                    for k, v in obj.items()
+                }
+            )
+        return func(obj)
+    finally:
+        # Leaving this container: it is no longer an ancestor, so a later sibling that
+        # references the same object is converted normally instead of being mistaken
+        # for a cycle.
+        if tracked:
+            _visited.discard(obj_id)
 class DataclassMeta(ABCMeta):
     """Metaclass for Dataclass.

dataset.py CHANGED Viewed

@@ -59,7 +59,6 @@ from .settings_utils import get_constants
 from .span_lableing_operators import __file__ as _
 from .split_utils import __file__ as _
 from .splitters import __file__ as _
-from .sql_utils import __file__ as _
 from .standard import __file__ as _
 from .stream import __file__ as _
 from .stream_operators import __file__ as _
@@ -68,6 +67,7 @@ from .struct_data_operators import __file__ as _
 from .system_prompts import __file__ as _
 from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _

 from .span_lableing_operators import __file__ as _
 from .split_utils import __file__ as _
 from .splitters import __file__ as _
 from .standard import __file__ as _
 from .stream import __file__ as _
 from .stream_operators import __file__ as _
 from .system_prompts import __file__ as _
 from .task import __file__ as _
 from .templates import __file__ as _
+from .text2sql_utils import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _

dialog_operators.py CHANGED Viewed

@@ -17,7 +17,16 @@ The format of the dialog is:
 from typing import Any, Dict, List, Optional
 from .formats import SystemFormat
-from .operators import InstanceFieldOperator
 class SerializeDialog(InstanceFieldOperator):

 from typing import Any, Dict, List, Optional
 from .formats import SystemFormat
+from .operators import FieldOperator, InstanceFieldOperator
+class ToDialog(FieldOperator):
+    def process_value(self, value: Any) -> Any:
+        dialog = []
+        for question, answer in value:
+            dialog.append({"role": "user", "content": question})
+            dialog.append({"role": "agent", "content": answer})
+        return dialog
 class SerializeDialog(InstanceFieldOperator):

dict_utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Any, List, Tuple
 from .text_utils import to_pretty_string
-indx = re.compile(r"^(\d+)$")
 def is_index(string):

 from .text_utils import to_pretty_string
+indx = re.compile(r"^-?\d+$")
 def is_index(string):

error_utils.py CHANGED Viewed

@@ -1,7 +1,11 @@
-from typing import Optional
 from .logging_utils import get_logger
 logger = get_logger()
@@ -29,12 +33,9 @@ class UnitxtError(Exception):
     """Exception raised for Unitxt errors.
     Args:
-        message (str):
-            explanation of the error
-        additional_info_id (Optional[str]):
-            relative path to additional documentation on web
-            If set, should be one of the DOCUMENATION_* constants in the error_utils.py file.
     """
     def __init__(self, message: str, additional_info_id: Optional[str] = None):
@@ -47,14 +48,255 @@ class UnitxtWarning:
     """Object to format warning message to log.
     Args:
-        message (str):
-            explanation of the warning
-        additional_info_id (Optional[str]):
-            relative path to additional documentation on web
-            If set, should be one of the DOCUMENATION_* constants in the error_utils.py file.
     """
     def __init__(self, message: str, additional_info_id: Optional[str] = None):
         if additional_info_id is not None:
             message += additional_info(additional_info_id)
         logger.warning(message)

+import re
+from contextlib import contextmanager
+from typing import Any, Optional
 from .logging_utils import get_logger
+from .settings_utils import get_constants
+constants = get_constants()
 logger = get_logger()
     """Exception raised for Unitxt errors.
     Args:
+        message (str): explanation of the error
+        additional_info_id (Optional[str]): relative path to additional documentation on web
+            If set, should be one of the DOCUMENTATION_* constants in the error_utils.py file.
     """
     def __init__(self, message: str, additional_info_id: Optional[str] = None):
     """Object to format warning message to log.
     Args:
+        message (str): explanation of the warning
+        additional_info_id (Optional[str]): relative path to additional documentation on web
+            If set, should be one of the DOCUMENTATION_* constants in the error_utils.py file.
     """
     def __init__(self, message: str, additional_info_id: Optional[str] = None):
         if additional_info_id is not None:
             message += additional_info(additional_info_id)
         logger.warning(message)
+context_block_title = "🦄 Unitxt Error Context"
+def _visible_length(text: str) -> int:
+    """Return the display width of `text`, ignoring terminal escape sequences.
+    Escape sequences matched by the pattern below (``ESC[...<letter>`` sequences and
+    ``ESC]8;;...ESC\\`` sequences) are removed first. Each remaining character counts as
+    2 columns when its East Asian width is "F" or "W" or its code point lies in
+    0x1F300-0x1F9FF, and as 1 column otherwise.
+    """
+    import unicodedata
+    escape_sequences = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]|\x1b\]8;;[^\x1b]*\x1b\\")
+    printable = escape_sequences.sub("", text)
+    return sum(
+        2
+        if (
+            unicodedata.east_asian_width(ch) in ("F", "W")
+            or 0x1F300 <= ord(ch) <= 0x1F9FF
+        )
+        else 1
+        for ch in printable
+    )
+def _make_object_clickable(
+    full_obj_name: str, display_name: Optional[str] = None
+) -> str:
+    import os
+    if display_name is None:
+        display_name = full_obj_name.split(".")[-1]
+    if full_obj_name.startswith("unitxt."):
+        parts = full_obj_name.split(".")
+        if len(parts) >= 2:
+            module_path = ".".join(parts[:2])
+            doc_url = f"{Documentation.URL}{module_path}.html#{full_obj_name}"
+            if (
+                os.environ.get("TERM_PROGRAM") in ["iTerm.app", "vscode"]
+                or os.environ.get("TERMINAL_EMULATOR") == "JetBrains-JediTerm"
+            ):
+                return f"\033]8;;{doc_url}\033\\{display_name}\033]8;;\033\\"
+            return f"{display_name} ({doc_url})"
+    return display_name
+def _get_existing_context(error: Exception):
+    """Return the context previously stored on `error`, if any.
+    Returns a tuple ``(original_message, context_object, context)``. When the error
+    carries an ``__error_context__`` attribute the three values are read from it;
+    otherwise the tuple is ``(str(error), None, {})``.
+    """
+    if not hasattr(error, "__error_context__"):
+        return str(error), None, {}
+    stored = error.__error_context__
+    return stored["original_message"], stored["context_object"], stored["context"]
+def _format_object_context(obj: Any) -> Optional[str]:
+    """Format an object for display in error context."""
+    if obj is None:
+        return None
+    if hasattr(obj, "__class__"):
+        class_name = obj.__class__.__name__
+        module_name = getattr(obj.__class__, "__module__", "")
+    else:
+        obj_type = type(obj)
+        class_name = obj_type.__name__
+        module_name = getattr(obj_type, "__module__", "")
+    if module_name:
+        full_name = f"{module_name}.{class_name}"
+        clickable_object = _make_object_clickable(full_name, class_name)
+        return f"Object: {clickable_object}"
+    return f"Object: {class_name}"
+def _make_clickable_link(url: str) -> str:
+    """Create a clickable terminal link."""
+    import os
+    if (
+        os.environ.get("TERM_PROGRAM") in ["iTerm.app", "vscode"]
+        or os.environ.get("TERMINAL_EMULATOR") == "JetBrains-JediTerm"
+    ):
+        return f"\033]8;;{url}\033\\link\033]8;;\033\\"
+    return url
+def _format_help_context(help_docs) -> list:
+    """Format help documentation into context parts."""
+    parts = []
+    if isinstance(help_docs, str):
+        parts.append(f"Help: {_make_clickable_link(help_docs)}")
+    elif isinstance(help_docs, dict):
+        for label, url in help_docs.items():
+            parts.append(f"Help ({label}): {_make_clickable_link(url)}")
+    elif isinstance(help_docs, list):
+        for item in help_docs:
+            if isinstance(item, dict) and len(item) == 1:
+                label, url = next(iter(item.items()))
+                parts.append(f"Help ({label}): {_make_clickable_link(url)}")
+            elif isinstance(item, str):
+                parts.append(f"Help: {_make_clickable_link(item)}")
+    return parts
+def _build_context_parts(context_object: Any, context: dict) -> list:
+    """Build the list of context information parts."""
+    parts = []
+    ordered_keys = [
+        "Python",
+        "Unitxt",
+        "Stage",
+        "Stream",
+        "Index",
+        "Instance",
+        "Object",
+        "Action",
+    ]
+    processed_keys = set()
+    for desired_key in ordered_keys:
+        for actual_key in context.keys():
+            if actual_key.lower() == desired_key.lower():
+                value = (
+                    "unknown" if context[actual_key] is None else context[actual_key]
+                )
+                parts.append(f"{actual_key.replace('_', ' ').title()}: {value}")
+                processed_keys.add(actual_key)
+                break
+    if not any(key.lower() == "object" for key in processed_keys):
+        obj_context = _format_object_context(context_object)
+        if obj_context:
+            parts.append(obj_context)
+    processed_keys.add("help")
+    for key, value in context.items():
+        if key not in processed_keys:
+            value = "unknown" if value is None else value
+            parts.append(f"{key.replace('_', ' ').title()}: {value}")
+    if "help" in context:
+        parts.extend(_format_help_context(context["help"]))
+    else:
+        parts.append(f"Help: {_make_clickable_link(Documentation.URL)}")
+    return parts
+def _create_context_box(parts: list) -> str:
+    """Create a formatted box containing context information."""
+    if not parts:
+        return ""
+    max_width = (
+        max(
+            _visible_length(context_block_title),
+            max(_visible_length(part) for part in parts),
+        )
+        + 4
+    )
+    top_line = "┌" + "─" * max_width + "┐"
+    bottom_line = "└" + "─" * max_width + "┘"
+    lines = [top_line]
+    lines.append(
+        f"│ {context_block_title}{' ' * (max_width - _visible_length(context_block_title) - 1)}│"
+    )
+    lines.append(f"│ {'-' * (max_width - 2)} │")
+    for part in parts:
+        padding = " " * (max_width - _visible_length(part) - 4)
+        lines.append(f"│  - {part}{padding}│")
+    lines.append(bottom_line)
+    return "\n".join(lines)
+def _store_context_attributes(
+    error: Exception, context_object: Any, context: dict, original_message: str
+):
+    """Store context information in error attributes.
+    Sets on `error`:
+        - ``__error_context__``: dict with the context object, the context and the original message.
+        - ``original_error``: a fresh exception carrying only `original_message`, of the same
+          type as `error` when that type can be constructed from a single message, otherwise
+          a plain `Exception`.
+        - ``context_object`` and ``context``: the values passed in.
+    """
+    error.__error_context__ = {
+        "context_object": context_object,
+        "context": context,
+        "original_message": original_message,
+    }
+    try:
+        error.original_error = type(error)(original_message)
+    except Exception:
+        # An exception constructor may expect other arguments and can fail with any
+        # exception type (e.g. AttributeError when it expects a response object).
+        # This runs while an error is already being handled, so a failure here must
+        # not replace that error: fall back to a plain Exception.
+        error.original_error = Exception(original_message)
+    error.context_object = context_object
+    error.context = context
+def _add_context_to_exception(
+    original_error: Exception, context_object: Any = None, **context
+):
+    """Add context information to an exception by modifying its message."""
+    original_message, existing_object, existing_context = _get_existing_context(
+        original_error
+    )
+    final_context_object = existing_object or context_object
+    final_context = {
+        "Unitxt": constants.version,
+        "Python": constants.python,
+        **existing_context,
+        **context,
+    }
+    context_parts = _build_context_parts(final_context_object, final_context)
+    context_message = _create_context_box(context_parts)
+    _store_context_attributes(
+        original_error, final_context_object, final_context, original_message
+    )
+    if context_parts:
+        formatted_message = f"\n{context_message}\n\n{original_message}"
+        original_error.args = (formatted_message,)
+    else:
+        original_error.args = (original_message,)
+@contextmanager
+def error_context(context_object: Any = None, **context):
+    """Context manager that catches exceptions and re-raises them with additional context.
+    Args:
+        context_object: The object being processed (optional)
+        **context: Any additional context to include in the error message.
+                  You can provide any key-value pairs that help identify where the error occurred.
+                  Special context keys:
+                  - help: Documentation links to help with the error.
+                    Can be a string (single URL), dict (label: URL), or list of URLs/dicts.
+    Examples:
+        with error_context(self, operation="validation", item_id=42):
+            result = process_item(item)
+        with error_context(operation="schema_validation", help="https://docs.example.com/schema"):
+            validate_schema(data)
+        with error_context(processor, step="preprocessing", batch_size=32):
+            results = process_batch(batch)
+    """
+    try:
+        yield
+    except Exception as e:
+        _add_context_to_exception(e, context_object, **context)
+        raise

evaluate_cli.py CHANGED Viewed

@@ -298,9 +298,13 @@ def cli_load_dataset(args: argparse.Namespace) -> HFDataset:
             dataset_query=task_str, **overwrite_args
         )
-    benchmark = Benchmark(subsets=benchmark_subsets)
-    test_dataset = _source_to_dataset(benchmark, split=args.split)
     logger.info(
         f"Dataset loaded successfully. Number of instances: {len(test_dataset)}"
     )
@@ -414,6 +418,8 @@ def initialize_inference_engine(
             chat_kwargs_dict=chat_kwargs_dict,
         )
     # --- Remote Model (CrossProviderInferenceEngine) ---
     elif args.model.lower() == "cross_provider":
         if "model_name" not in model_args_dict:
@@ -444,6 +450,9 @@ def initialize_inference_engine(
             model=remote_model_name,
             **model_args_dict,
         )
     else:
         # This case should not be reached due to argparse choices
         logger.error(
@@ -682,7 +691,7 @@ def _save_results_to_disk(
     # prepend to the results_path name the time in a way like this: 2025-04-04T11:37:32
-    timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
     results_path = prepend_timestamp_to_path(results_path, timestamp)
     samples_path = prepend_timestamp_to_path(samples_path, timestamp)
@@ -825,5 +834,49 @@ def main():
     logger.info("Unitxt Evaluation CLI finished successfully.")
 if __name__ == "__main__":
     main()

             dataset_query=task_str, **overwrite_args
         )
+    # This hack circumvents an issue with multi-level benchmarks (such as Bluebench's translation subset) that fail when wrapped with an additional Benchmark() object.
+    if len(benchmark_subsets) == 1:
+        source = next(iter(benchmark_subsets.values()))
+    else:
+        source = Benchmark(subsets=benchmark_subsets)
+    test_dataset = _source_to_dataset(source, split=args.split)
     logger.info(
         f"Dataset loaded successfully. Number of instances: {len(test_dataset)}"
     )
             chat_kwargs_dict=chat_kwargs_dict,
         )
+        # Keep the actual model name for the results
+        args.model = inference_model.model_name
     # --- Remote Model (CrossProviderInferenceEngine) ---
     elif args.model.lower() == "cross_provider":
         if "model_name" not in model_args_dict:
             model=remote_model_name,
             **model_args_dict,
         )
+        # Keep the actual model name for the results
+        args.model = inference_model.engine.model
     else:
         # This case should not be reached due to argparse choices
         logger.error(
     # prepend to the results_path name the time in a way like this: 2025-04-04T11:37:32
+    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
     results_path = prepend_timestamp_to_path(results_path, timestamp)
     samples_path = prepend_timestamp_to_path(samples_path, timestamp)
     logger.info("Unitxt Evaluation CLI finished successfully.")
+def extract_scores(directory):  # pragma: no cover
+    """Collect the scores stored in the evaluation result files of a directory.
+    Every file in `directory` whose name ends with "evaluation_results.json" is read as
+    JSON and turned into one row holding the model name, the UTC timestamp, the global
+    score ("Average") and, for every dict-valued entry of "results", that entry's score.
+    Missing values are reported as "N/A". A file that cannot be read or parsed is logged
+    and skipped.
+    Args:
+        directory: Path of the directory holding the results files.
+    Returns:
+        pandas.DataFrame: One row per parsed file, sorted by "Timestamp" in ascending order.
+        When no file could be parsed, an empty frame with the "Model", "Timestamp" and
+        "Average" columns.
+    """
+    import pandas as pd
+    data = []
+    for filename in sorted(os.listdir(directory)):
+        if not filename.endswith("evaluation_results.json"):
+            continue
+        file_path = os.path.join(directory, filename)
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                content = json.load(f)
+            env_info = content.get("environment_info", {})
+            results = content.get("results", {})
+            row = {
+                "Model": env_info.get("parsed_arguments", {}).get("model", "N/A"),
+                "Timestamp": env_info.get("timestamp_utc", "N/A"),
+                "Average": results.get("score", "N/A"),
+            }
+            for key, value in results.items():
+                if isinstance(value, dict):
+                    row[key] = value.get("score", "N/A")
+            data.append(row)
+        except Exception as e:
+            logger.error(f"Error parsing results file {filename}: {e}.")
+    if not data:
+        # Sorting an empty, column-less frame by "Timestamp" raises KeyError.
+        return pd.DataFrame(columns=["Model", "Timestamp", "Average"])
+    return pd.DataFrame(data).sort_values(by="Timestamp", ascending=True)
+def summarize_cli():
+    if len(sys.argv) != 2:
+        logger.error("Usage: python summarize_cli_results.py <results-directory>")
+        sys.exit(1)
+    directory = sys.argv[1]
+    df = extract_scores(directory)
+    logger.info(df.to_markdown(index=False))
 if __name__ == "__main__":
     main()

formats.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import re
 from abc import abstractmethod
 from typing import (
@@ -18,6 +19,7 @@ from .image_operators import image_to_data_url
 from .operator import InstanceOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype
 from .utils import retry_connection_with_exponential_backoff
 constants = get_constants()
@@ -135,6 +137,9 @@ class BaseFormat(Format):
     def _prepare_instance_fields(self, instance) -> Tuple[str]:
         instance_fields = {}
         for field in (
             "source",
             constants.instruction_field,
@@ -170,6 +175,7 @@ class BaseFormat(Format):
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
     ) -> str:
         """Abstract method for formatting instances in different subclasses.
@@ -256,7 +262,10 @@ class SystemFormat(BaseFormat):
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
     ) -> str:
         demos_string = ""
         for demo in demos:
             demo_str = self.demo_format.format(
@@ -356,8 +365,18 @@ class ChatAPIFormat(BaseFormat):
             )
         The resulting `messages` is now a dictionary ready for sending to the OpenAI API.
     """
     def to_content(self, text: str, media: Dict[str, Any]) -> Union[str, List[Content]]:
         # Regular expression to find <img> tags with src attribute
         img_tag_pattern = re.compile(
@@ -419,12 +438,15 @@ class ChatAPIFormat(BaseFormat):
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
     ) -> List[Message]:
         messages = []
-        if system_prompt or instruction:
             system_content = self.to_content(
-                system_prompt + ("\n" if system_prompt != "" else "") + instruction,
                 media,
             )
             messages.append(
@@ -435,13 +457,22 @@ class ChatAPIFormat(BaseFormat):
             )
         for demo_instance in demos:
-            user_content = self.to_content(demo_instance["source"], media)
             assistant_content = self.to_content(
-                target_prefix + demo_instance["target"], media
             )
             messages.extend(
                 [
-                    {"role": "user", "content": user_content},
                     {
                         "role": "assistant",
                         "content": assistant_content,
@@ -449,9 +480,15 @@ class ChatAPIFormat(BaseFormat):
                 ]
             )
-        last_user_content = self.to_content(source, media)
-        messages.extend([{"role": "user", "content": last_user_content}])
         return messages
@@ -463,6 +500,7 @@ class ChatAPIFormat(BaseFormat):
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
     ) -> Union[str, List[Message]]:
         chat = self.to_chat(
             system_prompt,
@@ -471,6 +509,7 @@ class ChatAPIFormat(BaseFormat):
             target_prefix,
             demos,
             media,
         )
         media["images"] = []
         return chat
@@ -492,6 +531,7 @@ class HFSystemFormat(ChatAPIFormat):
     """
     model_name: str
     _requirements_list = ["transformers", "Jinja2"]
     @retry_connection_with_exponential_backoff(backoff_factor=2)
@@ -509,13 +549,17 @@ class HFSystemFormat(ChatAPIFormat):
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
     ) -> str:
         chat = self.to_chat(
-            system_prompt, instruction, source, target_prefix, demos, media
         )
         return (
             self.tokenizer.apply_chat_template(
-                chat, tokenize=False, add_generation_prompt=True
             )
             + target_prefix
         )

+import json
 import re
 from abc import abstractmethod
 from typing import (
 from .operator import InstanceOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype
+from .types import Dialog
 from .utils import retry_connection_with_exponential_backoff
 constants = get_constants()
     def _prepare_instance_fields(self, instance) -> Tuple[str]:
         instance_fields = {}
+        if "__turns__" in instance:
+            instance_fields["turns"] = instance["__turns__"]
         for field in (
             "source",
             constants.instruction_field,
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
+        turns: Optional[Dialog] = None,
     ) -> str:
         """Abstract method for formatting instances in different subclasses.
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
+        turns: Optional[Dialog] = None,
     ) -> str:
+        if turns is not None and not source:
+            source = json.dumps(turns)
         demos_string = ""
         for demo in demos:
             demo_str = self.demo_format.format(
             )
         The resulting `messages` is now a dictionary ready for sending to the OpenAI API.
+        By default, the instruction in the template is placed in a turn with a 'system' role.
+        However, some chat tokenizers will not insert the model's default system prompt
+        if there is a turn with an explicit 'system' role.  To keep the default system prompt,
+        set 'place_instruction_in_user_turns=True'.  This causes the instruction of the template
+        to be placed in a turn with a 'user' role.  Note that the instruction will also be placed
+        in every demo turn (if demos are generated).
     """
+    place_instruction_in_user_turns: bool = False
     def to_content(self, text: str, media: Dict[str, Any]) -> Union[str, List[Content]]:
         # Regular expression to find <img> tags with src attribute
         img_tag_pattern = re.compile(
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
+        turns: Optional[Dialog] = None,
     ) -> List[Message]:
         messages = []
+        if system_prompt or (instruction and not self.place_instruction_in_user_turns):
             system_content = self.to_content(
+                system_prompt
+                + ("\n" if system_prompt != "" else "")
+                + (instruction if not self.place_instruction_in_user_turns else ""),
                 media,
             )
             messages.append(
             )
         for demo_instance in demos:
+            if "__turns__" in demo_instance:
+                messages.extend(demo_instance["__turns__"])
+            else:
+                text = demo_instance["source"]
+                if instruction and self.place_instruction_in_user_turns:
+                    text = f"{instruction}\n{text}"
+                source_content = self.to_content(text, media)
+                messages.extend([{"role": "user", "content": source_content}])
             assistant_content = self.to_content(
+                target_prefix + demo_instance["target"],
+                media,
             )
             messages.extend(
                 [
                     {
                         "role": "assistant",
                         "content": assistant_content,
                 ]
             )
+        text = source
+        if instruction and self.place_instruction_in_user_turns:
+            text = f"{instruction}\n{text}"
+        if turns is None:
+            last_user_content = self.to_content(text, media)
+            messages.extend([{"role": "user", "content": last_user_content}])
+        else:
+            messages.extend(turns)
         return messages
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
+        turns: Optional[Dialog] = None,
     ) -> Union[str, List[Message]]:
         chat = self.to_chat(
             system_prompt,
             target_prefix,
             demos,
             media,
+            turns,
         )
         media["images"] = []
         return chat
     """
     model_name: str
+    chat_kwargs_dict: Dict[str, str] = {}
     _requirements_list = ["transformers", "Jinja2"]
     @retry_connection_with_exponential_backoff(backoff_factor=2)
         target_prefix: str,
         demos: List[Dict[str, Any]],
         media: Optional[Dict[str, Any]] = None,
+        turns: Optional[Dialog] = None,
     ) -> str:
         chat = self.to_chat(
+            system_prompt, instruction, source, target_prefix, demos, media, turns
         )
         return (
             self.tokenizer.apply_chat_template(
+                chat,
+                tokenize=False,
+                add_generation_prompt=True,
+                **self.chat_kwargs_dict,
             )
             + target_prefix
         )

fusion.py CHANGED Viewed

@@ -2,6 +2,7 @@ from abc import abstractmethod
 from typing import Dict, Generator, List, Optional, Union
 from .dataclass import NonPositionalField
 from .logging_utils import get_logger
 from .operator import SourceOperator
 from .random_utils import new_random_generator
@@ -92,7 +93,7 @@ class FixedFusion(BaseFusion):
                         max_from_this_split = max_per_this_split
             logger.info(f"Processing {split} from {origin_name}...")
-            try:
                 for instance in multi_stream[split]:
                     if (
                         max_from_this_split is not None
@@ -105,8 +106,6 @@ class FixedFusion(BaseFusion):
                         instance["subset"].insert(0, origin_name)
                     emitted_from_this_split += 1
                     yield instance
-            except Exception as e:
-                raise RuntimeError(f"Exception in subset: {origin_name}") from e
 class WeightedFusion(BaseFusion):
@@ -164,16 +163,15 @@ class WeightedFusion(BaseFusion):
                 weights=[self.named_weights[name] for name in population],
             )[0]
             iterator = iterators[origin_name]
-            try:
-                instance = next(iterator)
-                if isinstance(origin_name, str):
-                    if "subset" not in instance:
-                        instance["subset"] = []
-                    instance["subset"].insert(0, origin_name)
-                total_examples += 1
-                yield instance
-            except StopIteration:
-                iterators.pop(origin_name)
-            except Exception as e:
-                raise RuntimeError(f"Exception in subset: {origin_name}") from e

 from typing import Dict, Generator, List, Optional, Union
 from .dataclass import NonPositionalField
+from .error_utils import error_context
 from .logging_utils import get_logger
 from .operator import SourceOperator
 from .random_utils import new_random_generator
                         max_from_this_split = max_per_this_split
             logger.info(f"Processing {split} from {origin_name}...")
+            with error_context(self, subset=origin_name):
                 for instance in multi_stream[split]:
                     if (
                         max_from_this_split is not None
                         instance["subset"].insert(0, origin_name)
                     emitted_from_this_split += 1
                     yield instance
 class WeightedFusion(BaseFusion):
                 weights=[self.named_weights[name] for name in population],
             )[0]
             iterator = iterators[origin_name]
+            with error_context(self, subset=origin_name):
+                try:
+                    instance = next(iterator)
+                    if isinstance(origin_name, str):
+                        if "subset" not in instance:
+                            instance["subset"] = []
+                        instance["subset"].insert(0, origin_name)
+                    total_examples += 1
+                    yield instance
+                except StopIteration:
+                    iterators.pop(origin_name)

inference.py CHANGED Viewed

@@ -39,7 +39,7 @@ from .artifact import Artifact
 from .base_metric import Metric
 from .dataclass import InternalField, NonPositionalField
 from .deprecation_utils import deprecation
-from .error_utils import UnitxtError, UnitxtWarning
 from .image_operators import (
     EncodeImageToString,
     ImageDataString,
@@ -121,6 +121,8 @@ class TextGenerationInferenceOutput:
         | For example: ``[ {.. "top_tokens": [ {"text": "a", 'logprob': },  {"text": "b", 'logprob': } ....]},
           {.. "top_tokens": [ {"text": "c", 'logprob': },  {"text": "d", 'logprob': } ....]} ]``
         input_tokens (int) : number of input tokens to the model.
         output_tokens (int) : number of output tokens to the model.
@@ -137,6 +139,7 @@ class TextGenerationInferenceOutput:
     """
     prediction: Union[str, List[Dict[str, Any]]]
     input_tokens: Optional[int] = None
     output_tokens: Optional[int] = None
     stop_reason: Optional[str] = None
@@ -186,12 +189,19 @@ class InferenceEngine(Artifact):
     def prepare(self):
         if not settings.mock_inference_mode:
             super().prepare()  # no need to prepare a mock
-            self.prepare_engine()
             if self.use_cache:
                 from diskcache import Cache
                 self._cache = Cache(
-                    settings.inference_engine_cache_path + self.__class__.__name__
                 )
     def __call__(
@@ -199,7 +209,12 @@ class InferenceEngine(Artifact):
         dataset: Union[List[Dict[str, Any]], Dataset],
         return_meta_data: bool = False,
     ) -> Union[ListWithMetadata[str], ListWithMetadata[TextGenerationInferenceOutput]]:
-        return self.infer(dataset=dataset, return_meta_data=return_meta_data)
     def get_instance_cache_key(self, instance):
         instance_key_fields = ["media", "source", "task_data"]
@@ -243,54 +258,69 @@ class InferenceEngine(Artifact):
             result = self._mock_infer(dataset)
         else:
             if self.use_cache:
-                number_of_batches = math.ceil(len(dataset) / self.cache_batch_size)
-                result = []
-                for batch_index, batch in enumerate(
-                    batched(dataset, self.cache_batch_size)
                 ):
-                    cached_results = []
-                    missing_examples = []
-                    for i, item in enumerate(batch):
-                        cache_key = self._get_cache_key(item)
-                        cached_value = self._cache.get(cache_key)
-                        if cached_value is not None:
-                            cached_results.append(
-                                (i, cached_value)
-                            )  # each element is index in batch, and value
-                        else:
-                            missing_examples.append(
-                                (i, item)
-                            )  # each element is index in batch and example
-                    # infare on missing examples only, without indices
-                    logger.info(
-                        f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})"
-                    )
-                    if len(missing_examples) > 0:
-                        inferred_results = self._infer(
-                            [e[1] for e in missing_examples], return_meta_data
-                        )
-                        # recombined to index and value
-                        inferred_results = list(
-                            zip([e[0] for e in missing_examples], inferred_results)
-                        )
-                        # Add missing examples to cache
-                        for (_, item), (_, prediction) in zip(
-                            missing_examples, inferred_results
-                        ):
-                            if prediction is None:
-                                continue
                             cache_key = self._get_cache_key(item)
-                            self._cache[cache_key] = prediction
-                    else:
-                        inferred_results = []
-                    # Combine cached and inferred results in original order
-                    batch_predictions = [
-                        p[1] for p in sorted(cached_results + inferred_results)
-                    ]
-                    result.extend(batch_predictions)
             else:
-                result = self._infer(dataset, return_meta_data)
         return ListWithMetadata(
             result,
             metadata={
@@ -339,7 +369,16 @@ class InferenceEngine(Artifact):
     def to_messages(self, instance):
         if isinstance(instance["source"], list):
-            return instance["source"]
         return [
             {
                 "role": "user",
@@ -521,13 +560,6 @@ class HFInferenceEngineBase(
     def get_engine_id(self):
         return get_model_and_label_id(self.model_name, self.label)
-    def decode_tokens(self, tokens: Sequence, inp_length: int) -> List[str]:
-        return self.processor.decode(tokens[inp_length:], skip_special_tokens=True)
-    @staticmethod
-    def create_string_from_tokens(string_tokens: List[str]) -> str:
-        return "".join(token for token in string_tokens)
     def make_predictions(self, prepared_inputs: Mapping) -> Mapping:
         return self.model.generate(
             **prepared_inputs,
@@ -598,6 +630,7 @@ class HFInferenceEngineBase(
     def get_return_object(
         self,
         output: Union[str, List[Dict[str, Any]]],
         output_tokens: Optional[int],
         inp: Optional[str],
         inp_tokens: Optional[int],
@@ -606,6 +639,7 @@ class HFInferenceEngineBase(
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=output,
                 output_tokens=output_tokens if output_tokens is not None else None,
                 input_text=inp,
                 input_tokens=inp_tokens if inp_tokens is not None else None,
@@ -689,7 +723,8 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
         # cause an error because the data is always on the gpu
         # if torch.cuda.device_count() > 1:
         # assert self.device == torch.device(0)
-        args["device_map"] = "auto"
         # else:
         #     if not self.load_in_8bit:
         #         args["device"] = self.device
@@ -717,15 +752,21 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
             **model_args,
         )
-    def prepare_inputs(self, data: Iterable) -> Mapping:
         tokenizer_kargs = {}
         if isinstance(data[0], list):
-            data = self.processor.apply_chat_template(
-                data,
-                tokenize=False,
-                add_generation_prompt=True,
-                **self.chat_kwargs_dict,
-            )
             tokenizer_kargs["add_special_tokens"] = False
         if self.processor.pad_token is None:
@@ -766,59 +807,71 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
             total=len(dataset) // self.batch_size,
         ):
             # Get the current batch
-            batch_sources = [instance["source"] for instance in batch]
-            # --- Process the current batch ---
-            # 1. Tokenize inputs for the batch
-            tokenized_inputs = self.prepare_inputs(batch_sources)
-            # 2. Determine input length (handle encoder-decoder models)
             input_length = (
                 1
                 if self.model.config.is_encoder_decoder
                 else tokenized_inputs.input_ids.shape[1]
             )
-            # 3. Make predictions for the batch
             predictions = self.make_predictions(tokenized_inputs)
             sequences = predictions.sequences  # Sequences for the current batch
-            # 4. Decode tokens for the batch
-            string_tokens_batch = [
-                self.decode_tokens(sequence, input_length) for sequence in sequences
-            ]
-            # 5. Calculate logprobs or create strings for the batch
-            final_outputs_batch = (
-                self.get_logprobs(predictions, string_tokens_batch)
-                if return_logprobs
-                else [
-                    self.create_string_from_tokens(strings)
-                    for strings in string_tokens_batch
-                ]
-            )
-            # 6. Create return objects for the batch
-            batch_results = [
-                self.get_return_object(
-                    output=final_outputs_batch[
-                        j
-                    ],  # Output for the j-th item in the batch
-                    output_tokens=len(string_tokens_batch[j]),
-                    inp=batch[j]["source"],  # Original input for the j-th item
-                    inp_tokens=len(tokenized_inputs.encodings[j].tokens)
-                    if tokenized_inputs.encodings is not None
-                    else None,
-                    return_meta_data=return_meta_data,
                 )
-                for j in range(
-                    len(sequences)
-                )  # Iterate through items in the current batch
-            ]
-            # Add results from this batch to the overall list
             all_final_outputs.extend(batch_results)
-            # --- End of batch processing ---
         return all_final_outputs
@@ -847,7 +900,10 @@ class HFLlavaInferenceEngine(HFInferenceEngineBase):
         self, sequences: Sequence, scores: Sequence, beam_indices: Optional[int]
     ) -> Sequence:
         if not hasattr(self.model.config, "vocab_size"):
-            self.model.config.vocab_size = self.model.vocab_size
         return super().compute_transition_scores(sequences, scores, beam_indices)
@@ -917,18 +973,35 @@ class HFLlavaInferenceEngine(HFInferenceEngineBase):
             predictions = self.make_predictions(processed_inputs)
-            string_tokens = self.decode_tokens(predictions.sequences[0], input_len)
-            final_outputs = (
-                self.get_logprobs(predictions, [string_tokens])[0]
-                if return_logprobs
-                else self.create_string_from_tokens(string_tokens)
-            )
             results.append(
                 self.get_return_object(
-                    output=final_outputs,
-                    output_tokens=len(string_tokens),
                     inp=instance["source"],
                     inp_tokens=None,
                     return_meta_data=return_meta_data,
@@ -1189,6 +1262,7 @@ class HFPipelineBasedInferenceEngine(
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=output["generated_text"],
                 model_name=self.model_name,
                 inference_type=self.label,
                 input_text=inp,
@@ -1252,10 +1326,13 @@ class MockInferenceEngine(InferenceEngine, LogProbInferenceEngine):
             for instance in dataset
         ]
-    def get_return_object(self, predict_result, instance, return_meta_data):
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
                 input_tokens=len(instance["source"]),
                 output_tokens=len(predict_result),
                 model_name=self.model_name,
@@ -1369,21 +1446,25 @@ class OllamaInferenceEngine(
         return get_model_and_label_id(self.model, self.label)
     def prepare_engine(self):
-        pass
     def _infer(
         self,
         dataset: Union[List[Dict[str, Any]], Dataset],
         return_meta_data: bool = False,
     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
-        import ollama
         args = self.to_dict([StandardAPIParamsMixin])
         results = []
         model = args.pop("model")
         for instance in dataset:
             messages = self.to_messages(instance)
-            response = ollama.chat(
                 messages=messages,
                 model=model,
                 options=args,
@@ -1877,7 +1958,7 @@ class OpenAiInferenceEngine(
                 f"Error predicting instance {messages}:{e}. Returning empty prediction"
             )
             return TextGenerationInferenceOutput(
-                prediction="-", input_tokens=0, output_tokens=0
             )
     @run_with_imap
@@ -1894,10 +1975,12 @@ class OpenAiInferenceEngine(
             top_logprobs_response = response.choices[0].logprobs.content
             pred_output = [
                 {
                     "top_tokens": [
                         {"text": obj.token, "logprob": obj.logprob}
                         for obj in generated_token.top_logprobs
-                    ]
                 }
                 for generated_token in top_logprobs_response
             ]
@@ -1907,15 +1990,21 @@ class OpenAiInferenceEngine(
             logging.error(
                 f"Error predicting instance {messages}:{e}. Returning empty prediction"
             )
-            prediction = [{"top_tokens": [{"text": "-", "logprob": 0}]}]
             return TextGenerationInferenceOutput(
-                prediction=prediction, input_tokens=0, output_tokens=0
             )
     def get_return_object(self, predict_result, response, return_meta_data):
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
                 input_tokens=response.usage.prompt_tokens,
                 output_tokens=response.usage.completion_tokens,
                 model_name=self.model_name,
@@ -1973,7 +2062,12 @@ class RITSInferenceEngine(
     label: str = "rits"
     data_classification_policy = ["public", "proprietary"]
-    model_names_dict = {"microsoft/phi-4": "microsoft-phi-4"}
     def get_default_headers(self):
         return {"RITS_API_KEY": self.credentials["api_key"]}
@@ -2606,6 +2700,7 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMi
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
                 input_tokens=result["input_token_count"],
                 output_tokens=result["generated_token_count"],
                 model_name=self.model_name or self.deployment_id,
@@ -2865,6 +2960,8 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
             tool_call = data[idx]["tools"]["tools"] is not None
             output = response["choices"][0][output_type]
             if tool_call:
                 if "tool_calls" in output:
                     func = output["tool_calls"][0]["function"]
@@ -2877,6 +2974,7 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
             results.append(
                 self.get_return_object(
                     prediction,
                     response,
                     str(inp),
                     return_meta_data,
@@ -2885,10 +2983,13 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
         return results
-    def get_return_object(self, predict_result, result, input_text, return_meta_data):
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
                 input_tokens=result["usage"]["prompt_tokens"],
                 output_tokens=len(predict_result)
                 if isinstance(predict_result, list)
@@ -3286,6 +3387,7 @@ class LiteLLMInferenceEngine(
                     prediction = response["choices"][0]["message"]["content"] or ""
             return TextGenerationInferenceOutput(
                 prediction=prediction,
                 input_tokens=usage.get("prompt_tokens"),
                 output_tokens=usage.get("completion_tokens"),
                 model_name=response.get("model", self.model),
@@ -3436,21 +3538,22 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         },
         "rits": {
             "granite-3-8b-instruct": "ibm-granite/granite-3.0-8b-instruct",
             "granite-3-2-8b-instruct": "ibm-granite/granite-3.2-8b-instruct",
             "granite-3-3-8b-instruct": "ibm-granite/granite-3.3-8b-instruct",
-            "llama-3-1-8b-instruct": "meta-llama/llama-3-1-8b-instruct",
             "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct",
             "llama-3-1-405b-instruct": "meta-llama/llama-3-1-405b-instruct-fp8",
             "llama-3-1-405b-instruct-fp8": "meta-llama/llama-3-1-405b-instruct-fp8",
             "llama-3-2-11b-vision-instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
             "llama-3-2-90b-vision-instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
-            "llama-4-scout": "llama-4-scout-17b-16e",
-            "llama-4-maverick": "llama-4-mvk-17b-128e-fp8",
             "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
             "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
             "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7B-instruct-v0.1",
-            "deepseek-v3": "deepseek-ai/deepseek-v3-h200",
             "granite-guardian-3-2-3b-a800m": "ibm-granite/granite-guardian-3.2-3b-a800m",
             "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b",
         },

 from .base_metric import Metric
 from .dataclass import InternalField, NonPositionalField
 from .deprecation_utils import deprecation
+from .error_utils import UnitxtError, UnitxtWarning, error_context
 from .image_operators import (
     EncodeImageToString,
     ImageDataString,
         | For example: ``[ {.. "top_tokens": [ {"text": "a", 'logprob': },  {"text": "b", 'logprob': } ....]},
           {.. "top_tokens": [ {"text": "c", 'logprob': },  {"text": "d", 'logprob': } ....]} ]``
+        generated_text (str): The text generated by the model (in both _infer and _infer_log_probs calls).
         input_tokens (int) : number of input tokens to the model.
         output_tokens (int) : number of output tokens generated by the model.
     """
     prediction: Union[str, List[Dict[str, Any]]]
+    generated_text: str
     input_tokens: Optional[int] = None
     output_tokens: Optional[int] = None
     stop_reason: Optional[str] = None
     def prepare(self):
         if not settings.mock_inference_mode:
             super().prepare()  # no need to prepare a mock
+            with error_context(
+                self,
+                stage="Prepare Inference Engine",
+                help="https://www.unitxt.ai/en/latest/docs/inference.html",
+            ):
+                self.prepare_engine()
             if self.use_cache:
                 from diskcache import Cache
                 self._cache = Cache(
+                    os.path.join(
+                        settings.inference_engine_cache_path, self.__class__.__name__
+                    )
                 )
     def __call__(
         dataset: Union[List[Dict[str, Any]], Dataset],
         return_meta_data: bool = False,
     ) -> Union[ListWithMetadata[str], ListWithMetadata[TextGenerationInferenceOutput]]:
+        with error_context(
+            self,
+            stage="Running Inference",
+            help="https://www.unitxt.ai/en/latest/docs/inference.html",
+        ):
+            return self.infer(dataset=dataset, return_meta_data=return_meta_data)
     def get_instance_cache_key(self, instance):
         instance_key_fields = ["media", "source", "task_data"]
             result = self._mock_infer(dataset)
         else:
             if self.use_cache:
+                with error_context(
+                    self,
+                    stage="Inference Cache Handling",
+                    help="https://www.unitxt.ai/en/latest/docs/inference.html",
                 ):
+                    number_of_batches = math.ceil(len(dataset) / self.cache_batch_size)
+                    result = []
+                    for batch_index, batch in enumerate(
+                        batched(dataset, self.cache_batch_size)
+                    ):
+                        cached_results = []
+                        missing_examples = []
+                        for i, item in enumerate(batch):
                             cache_key = self._get_cache_key(item)
+                            cached_value = self._cache.get(cache_key)
+                            if cached_value is not None:
+                                cached_results.append(
+                                    (i, cached_value)
+                                )  # each element is index in batch, and value
+                            else:
+                                missing_examples.append(
+                                    (i, item)
+                                )  # each element is index in batch and example
+                        # infer on missing examples only, without indices
+                        logger.info(
+                            f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})"
+                        )
+                        if len(missing_examples) > 0:
+                            with error_context(
+                                self,
+                                stage="Running Inference",
+                                help="https://www.unitxt.ai/en/latest/docs/inference.html",
+                            ):
+                                inferred_results = self._infer(
+                                    [e[1] for e in missing_examples], return_meta_data
+                                )
+                            # recombined to index and value
+                            inferred_results = list(
+                                zip([e[0] for e in missing_examples], inferred_results)
+                            )
+                            # Add missing examples to cache
+                            for (_, item), (_, prediction) in zip(
+                                missing_examples, inferred_results
+                            ):
+                                if prediction is None:
+                                    continue
+                                cache_key = self._get_cache_key(item)
+                                self._cache[cache_key] = prediction
+                        else:
+                            inferred_results = []
+                        # Combine cached and inferred results in original order
+                        batch_predictions = [
+                            p[1] for p in sorted(cached_results + inferred_results)
+                        ]
+                        result.extend(batch_predictions)
             else:
+                with error_context(
+                    self,
+                    stage="Running Inference",
+                    help="https://www.unitxt.ai/en/latest/docs/inference.html",
+                ):
+                    result = self._infer(dataset, return_meta_data)
         return ListWithMetadata(
             result,
             metadata={
     def to_messages(self, instance):
         if isinstance(instance["source"], list):
+            messages = []
+            for message in instance["source"]:
+                if "tool_calls" in message:
+                    for tool_call in message["tool_calls"]:
+                        if not isinstance(tool_call["function"]["arguments"], str):
+                            tool_call["function"]["arguments"] = json.dumps(
+                                tool_call["function"]["arguments"]
+                            )
+                messages.append(message)
+            return messages
         return [
             {
                 "role": "user",
     def get_engine_id(self):
         return get_model_and_label_id(self.model_name, self.label)
     def make_predictions(self, prepared_inputs: Mapping) -> Mapping:
         return self.model.generate(
             **prepared_inputs,
     def get_return_object(
         self,
         output: Union[str, List[Dict[str, Any]]],
+        generated_text: str,
         output_tokens: Optional[int],
         inp: Optional[str],
         inp_tokens: Optional[int],
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=output,
+                generated_text=generated_text,
                 output_tokens=output_tokens if output_tokens is not None else None,
                 input_text=inp,
                 input_tokens=inp_tokens if inp_tokens is not None else None,
         # cause an error because the data is always on the gpu
         # if torch.cuda.device_count() > 1:
         # assert self.device == torch.device(0)
+        if self.device_map is None:
+            args["device_map"] = "auto"
         # else:
         #     if not self.load_in_8bit:
         #         args["device"] = self.device
             **model_args,
         )
+    def prepare_inputs(self, data: Iterable, tools: Iterable) -> Mapping:
         tokenizer_kargs = {}
         if isinstance(data[0], list):
+            processed = []
+            for item, item_tools in zip(data, tools):
+                processed.append(
+                    self.processor.apply_chat_template(
+                        item,
+                        tokenize=False,
+                        tools=item_tools,
+                        add_generation_prompt=True,
+                        **self.chat_kwargs_dict,
+                    )
+                )
+            data = processed
             tokenizer_kargs["add_special_tokens"] = False
         if self.processor.pad_token is None:
             total=len(dataset) // self.batch_size,
         ):
             # Get the current batch
+            sources = []
+            tools = []
+            for instance in batch:
+                sources.append(instance["source"])
+                if "task_data" in instance and "__tools__" in instance["task_data"]:
+                    task_data = instance["task_data"]
+                    if isinstance(task_data, str):
+                        task_data = json.loads(task_data)
+                    tools.append(task_data["__tools__"])
+                else:
+                    tools.append(None)
+            # Tokenize inputs for the batch
+            tokenized_inputs = self.prepare_inputs(sources, tools)
+            # Determine input length (handle encoder-decoder models)
             input_length = (
                 1
                 if self.model.config.is_encoder_decoder
                 else tokenized_inputs.input_ids.shape[1]
             )
+            # Make predictions for the batch
             predictions = self.make_predictions(tokenized_inputs)
             sequences = predictions.sequences  # Sequences for the current batch
+            output_tokens = sequences[:, input_length:]
+            output_tokens_strings = []
+            for tokens in output_tokens:
+                output_tokens_strings.append(
+                    [
+                        self.processor.decode(token, skip_special_tokens=True)
+                        for token in tokens
+                    ]
+                )
+            output_strings = []
+            for tokens in output_tokens:
+                output_strings.append(
+                    self.processor.decode(tokens, skip_special_tokens=True)
+                )
+            if return_logprobs:
+                outputs = self.get_logprobs(predictions, output_tokens_strings)
+            else:
+                outputs = output_strings
+            # Create return objects for the batch
+            batch_results = []
+            for i in range(len(sequences)):
+                batch_results.append(
+                    self.get_return_object(
+                        output=outputs[i],
+                        generated_text=output_strings[i],
+                        output_tokens=len(output_tokens_strings[i]),
+                        inp=sources[i],
+                        inp_tokens=len(tokenized_inputs.encodings[i].tokens)
+                        if tokenized_inputs.encodings is not None
+                        else None,
+                        return_meta_data=return_meta_data,
+                    )
                 )
             all_final_outputs.extend(batch_results)
         return all_final_outputs
         self, sequences: Sequence, scores: Sequence, beam_indices: Optional[int]
     ) -> Sequence:
         if not hasattr(self.model.config, "vocab_size"):
+            try:
+                self.model.config.vocab_size = self.model.vocab_size
+            except:
+                self.model.config.vocab_size = self.model.config.text_config.vocab_size
         return super().compute_transition_scores(sequences, scores, beam_indices)
             predictions = self.make_predictions(processed_inputs)
+            sequences = predictions.sequences  # Sequences for the current batch
+            output_tokens = sequences[:, input_len:]
+            output_tokens_strings = []
+            for tokens in output_tokens:
+                output_tokens_strings.append(
+                    [
+                        self.processor.decode(token, skip_special_tokens=True)
+                        for token in tokens
+                    ]
+                )
+            output_strings = []
+            for tokens in output_tokens:
+                output_strings.append(
+                    self.processor.decode(tokens, skip_special_tokens=True)
+                )
+            if return_logprobs:
+                final_outputs = self.get_logprobs(predictions, output_tokens_strings)
+            else:
+                final_outputs = output_strings
             results.append(
                 self.get_return_object(
+                    output=final_outputs[0],
+                    generated_text=output_strings,
+                    output_tokens=len(output_tokens_strings[0]),
                     inp=instance["source"],
                     inp_tokens=None,
                     return_meta_data=return_meta_data,
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=output["generated_text"],
+                generated_text=output["generated_text"],
                 model_name=self.model_name,
                 inference_type=self.label,
                 input_text=inp,
             for instance in dataset
         ]
+    def get_return_object(
+        self, predict_result, generated_text, instance, return_meta_data
+    ):
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
+                generated_text=self.default_inference_value,
                 input_tokens=len(instance["source"]),
                 output_tokens=len(predict_result),
                 model_name=self.model_name,
         return get_model_and_label_id(self.model, self.label)
     def prepare_engine(self):
+        from ollama import Client
+        self.client = Client(
+            host=self.credentials["api_base"]
+            if self.credentials is not None and "api_base" in self.credentials
+            else None
+        )
     def _infer(
         self,
         dataset: Union[List[Dict[str, Any]], Dataset],
         return_meta_data: bool = False,
     ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
         args = self.to_dict([StandardAPIParamsMixin])
         results = []
         model = args.pop("model")
         for instance in dataset:
             messages = self.to_messages(instance)
+            response = self.client.chat(
                 messages=messages,
                 model=model,
                 options=args,
                 f"Error predicting instance {messages}:{e}. Returning empty prediction"
             )
             return TextGenerationInferenceOutput(
+                prediction="-", generated_text="-", input_tokens=0, output_tokens=0
             )
     @run_with_imap
             top_logprobs_response = response.choices[0].logprobs.content
             pred_output = [
                 {
+                    "text": generated_token.token,
+                    "logprob": generated_token.logprob,
                     "top_tokens": [
                         {"text": obj.token, "logprob": obj.logprob}
                         for obj in generated_token.top_logprobs
+                    ],
                 }
                 for generated_token in top_logprobs_response
             ]
             logging.error(
                 f"Error predicting instance {messages}:{e}. Returning empty prediction"
             )
+            prediction = [
+                {"text": "-", "logprob": 0, "top_tokens": [{"text": "-", "logprob": 0}]}
+            ]
             return TextGenerationInferenceOutput(
+                prediction=prediction,
+                generated_text=prediction,
+                input_tokens=0,
+                output_tokens=0,
             )
     def get_return_object(self, predict_result, response, return_meta_data):
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
+                generated_text=response.choices[0].message.content,
                 input_tokens=response.usage.prompt_tokens,
                 output_tokens=response.usage.completion_tokens,
                 model_name=self.model_name,
     label: str = "rits"
     data_classification_policy = ["public", "proprietary"]
+    model_names_dict = {
+        "microsoft/phi-4": "microsoft-phi-4",
+        "meta-llama/llama-4-maverick-17b-128e-instruct-fp8": "llama-4-mvk-17b-128e-fp8",
+        "deepseek-ai/DeepSeek-V3": "deepseek-v3-h200",
+        "meta-llama/Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
+    }
     def get_default_headers(self):
         return {"RITS_API_KEY": self.credentials["api_key"]}
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
+                generated_text=result["generated_text"],
                 input_tokens=result["input_token_count"],
                 output_tokens=result["generated_token_count"],
                 model_name=self.model_name or self.deployment_id,
             tool_call = data[idx]["tools"]["tools"] is not None
             output = response["choices"][0][output_type]
+            if "content" not in output:
+                output["content"] = ""
             if tool_call:
                 if "tool_calls" in output:
                     func = output["tool_calls"][0]["function"]
             results.append(
                 self.get_return_object(
                     prediction,
+                    response["choices"][0]["message"]["content"],
                     response,
                     str(inp),
                     return_meta_data,
         return results
+    def get_return_object(
+        self, predict_result, generated_text, result, input_text, return_meta_data
+    ):
         if return_meta_data:
             return TextGenerationInferenceOutput(
                 prediction=predict_result,
+                generated_text=generated_text,
                 input_tokens=result["usage"]["prompt_tokens"],
                 output_tokens=len(predict_result)
                 if isinstance(predict_result, list)
                     prediction = response["choices"][0]["message"]["content"] or ""
             return TextGenerationInferenceOutput(
                 prediction=prediction,
+                generated_text=response["choices"][0]["message"]["content"],
                 input_tokens=usage.get("prompt_tokens"),
                 output_tokens=usage.get("completion_tokens"),
                 model_name=response.get("model", self.model),
         },
         "rits": {
             "granite-3-8b-instruct": "ibm-granite/granite-3.0-8b-instruct",
+            "granite-3-1-8b-instruct": "ibm-granite/granite-3.1-8b-instruct",
             "granite-3-2-8b-instruct": "ibm-granite/granite-3.2-8b-instruct",
             "granite-3-3-8b-instruct": "ibm-granite/granite-3.3-8b-instruct",
+            "llama-3-1-8b-instruct": "meta-llama/Llama-3.1-8B-Instruct",
             "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct",
             "llama-3-1-405b-instruct": "meta-llama/llama-3-1-405b-instruct-fp8",
             "llama-3-1-405b-instruct-fp8": "meta-llama/llama-3-1-405b-instruct-fp8",
             "llama-3-2-11b-vision-instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
             "llama-3-2-90b-vision-instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
+            "llama-4-scout": "meta-llama/llama-4-scout-17b-16e",
+            "llama-4-maverick": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
             "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
             "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
             "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7B-instruct-v0.1",
+            "deepseek-v3": "deepseek-ai/DeepSeek-V3",
             "granite-guardian-3-2-3b-a800m": "ibm-granite/granite-guardian-3.2-3b-a800m",
             "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b",
         },

llm_as_judge_constants.py CHANGED Viewed

@@ -125,7 +125,7 @@ EVALUATOR_TO_MODEL_ID = {
     EvaluatorNameEnum.GRANITE3_1_8B: "granite-3-1-8b-instruct",
     EvaluatorNameEnum.GRANITE3_2_8B: "granite-3-2-8b-instruct",
     EvaluatorNameEnum.GRANITE3_3_8B: "granite-3-3-8b-instruct",
-    EvaluatorNameEnum.DEEPSEEK_V3: "deepseek-ai/DeepSeek-V3",
     EvaluatorNameEnum.GEMMA_2_5_PRO: "gemma-2-5-pro",
     EvaluatorNameEnum.GEMINI_2_5_FLASH: "gemini-2-5-flash",
 }
@@ -198,7 +198,6 @@ EVALUATORS_METADATA = [
         [
             ModelProviderEnum.WATSONX,
             ModelProviderEnum.TOGETHER_AI,
-            ModelProviderEnum.RITS,
             ModelProviderEnum.OLLAMA,
         ],
     ),

     EvaluatorNameEnum.GRANITE3_1_8B: "granite-3-1-8b-instruct",
     EvaluatorNameEnum.GRANITE3_2_8B: "granite-3-2-8b-instruct",
     EvaluatorNameEnum.GRANITE3_3_8B: "granite-3-3-8b-instruct",
+    EvaluatorNameEnum.DEEPSEEK_V3: "deepseek-v3",
     EvaluatorNameEnum.GEMMA_2_5_PRO: "gemma-2-5-pro",
     EvaluatorNameEnum.GEMINI_2_5_FLASH: "gemini-2-5-flash",
 }
         [
             ModelProviderEnum.WATSONX,
             ModelProviderEnum.TOGETHER_AI,
             ModelProviderEnum.OLLAMA,
         ],
     ),

loaders.py CHANGED Viewed

@@ -66,7 +66,7 @@ from tqdm import tqdm
 from .dataclass import NonPositionalField
 from .dict_utils import dict_get
-from .error_utils import Documentation, UnitxtError, UnitxtWarning
 from .fusion import FixedFusion
 from .logging_utils import get_logger
 from .operator import SourceOperator
@@ -90,23 +90,27 @@ class UnitxtUnverifiedCodeError(UnitxtError):
 @retry_connection_with_exponential_backoff(backoff_factor=2)
 def hf_load_dataset(path: str, *args, **kwargs):
-    if settings.hf_offline_datasets_path is not None:
-        path = os.path.join(settings.hf_offline_datasets_path, path)
-    try:
-        return _hf_load_dataset(
-            path,
-            *args,
-            **kwargs,
-            verification_mode="no_checks",
-            trust_remote_code=settings.allow_unverified_code,
-            download_mode="force_redownload"
-            if settings.disable_hf_datasets_cache
-            else "reuse_dataset_if_exists",
-        )
-    except ValueError as e:
-        if "trust_remote_code" in str(e):
-            raise UnitxtUnverifiedCodeError(path) from e
-        raise e  # Re raise
 @retry_connection_with_exponential_backoff(backoff_factor=2)
@@ -218,13 +222,15 @@ class Loader(SourceOperator):
         pass
     def load_data(self) -> MultiStream:
-        try:
             iterables = self.load_iterables()
-        except Exception as e:
-            raise UnitxtError(f"Error in loader:\n{self}") from e
-        if isoftype(iterables, MultiStream):
-            return iterables
-        return MultiStream.from_iterables(iterables, copying=True)
     def process(self) -> MultiStream:
         self._maybe_set_classification_policy()
@@ -514,9 +520,13 @@ class LoadCSV(LoadWithPandas):
     sep: str = ","
     def read_dataframe(self, file) -> pd.DataFrame:
-        return pd.read_csv(
-            file, sep=self.sep, low_memory=self.streaming, **self.get_args()
-        )
 def read_file(source) -> bytes:
@@ -560,32 +570,36 @@ class LoadJsonFile(LoadWithPandas):
     data_field: Optional[str] = None
     def read_dataframe(self, file) -> pd.DataFrame:
-        args = self.get_args()
-        if not self.lines:
-            data = json.loads(read_file(file))
-            if self.data_field:
-                instances = dict_get(data, self.data_field)
-                if not isoftype(instances, List[Dict[str, Any]]):
-                    raise UnitxtError(
-                        f"{self.data_field} of file {file} is not a list of dictionariess in LoadJsonFile loader"
-                    )
-            else:
-                if isoftype(data, Dict[str, Any]):
-                    instances = [data]
-                elif isoftype(data, List[Dict[str, Any]]):
-                    instances = data
                 else:
                     raise UnitxtError(
-                        f"data of file {file} is not dictionary or a list of dictionaries in LoadJsonFile loader"
                     )
-            dataframe = pd.DataFrame(instances)
-        else:
-            if self.data_field is not None:
-                raise UnitxtError(
-                    "Can not load from a specific 'data_field' when loading multiple lines (lines=True)"
-                )
-            dataframe = pd.read_json(file, lines=self.lines, **args)
-        return dataframe
 class LoadFromSklearn(LazyLoader):
@@ -631,8 +645,12 @@ class LoadFromSklearn(LazyLoader):
         dataset_id = str(self) + "_" + split
         dataset = self.__class__._loader_cache.get(dataset_id, None)
         if dataset is None:
-            split_data = self.downloader(subset=split)
-            targets = [split_data["target_names"][t] for t in split_data["target"]]
             df = pd.DataFrame([split_data["data"], targets]).T
             df.columns = ["data", "target"]
             dataset = df.to_dict("records")
@@ -851,18 +869,22 @@ class LoadFromIBMCloud(Loader):
                     if self.data_dir is not None
                     else data_file
                 )
-                with tempfile.NamedTemporaryFile() as temp_file:
-                    # Download to  a temporary file in same file partition, and then do an atomic move
-                    self._download_from_cos(
-                        cos,
-                        self.bucket_name,
-                        object_key,
-                        local_dir + "/" + os.path.basename(temp_file.name),
-                    )
-                    os.renames(
-                        local_dir + "/" + os.path.basename(temp_file.name),
-                        local_dir + "/" + data_file,
-                    )
         if isinstance(self.data_files, list):
             dataset = hf_load_dataset(local_dir, streaming=False, field=self.data_field)
@@ -946,22 +968,26 @@ class LoadFromDictionary(Loader):
     def verify(self):
         super().verify()
-        if not isoftype(self.data, Dict[str, List[Dict[str, Any]]]):
-            raise ValueError(
-                f"Passed data to LoadFromDictionary is not of type Dict[str, List[Dict[str, Any]]].\n"
-                f"Expected data should map between split name and list of instances.\n"
-                f"Received value: {self.data}\n"
-            )
-        for split in self.data.keys():
-            if len(self.data[split]) == 0:
-                raise ValueError(f"Split {split} has no instances.")
-            first_instance = self.data[split][0]
-            for instance in self.data[split]:
-                if instance.keys() != first_instance.keys():
-                    raise ValueError(
-                        f"Not all instances in split '{split}' have the same fields.\n"
-                        f"instance {instance} has different fields different from {first_instance}"
-                    )
     def _maybe_set_classification_policy(self):
         self.set_default_data_classification(
@@ -1127,7 +1153,7 @@ class LoadFromAPI(Loader):
     chunksize: int = 100000
     loader_limit: Optional[int] = None
     streaming: bool = False
-    api_key_env_var: Optional[str] = ""
     headers: Optional[Dict[str, Any]] = None
     data_field: str = "data"
     method: str = "GET"

 from .dataclass import NonPositionalField
 from .dict_utils import dict_get
+from .error_utils import Documentation, UnitxtError, UnitxtWarning, error_context
 from .fusion import FixedFusion
 from .logging_utils import get_logger
 from .operator import SourceOperator
 @retry_connection_with_exponential_backoff(backoff_factor=2)
 def hf_load_dataset(path: str, *args, **kwargs):
+    with error_context(
+        stage="Raw Dataset Download",
+        help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+    ):
+        if settings.hf_offline_datasets_path is not None:
+            path = os.path.join(settings.hf_offline_datasets_path, path)
+        try:
+            return _hf_load_dataset(
+                path,
+                *args,
+                **kwargs,
+                verification_mode="no_checks",
+                trust_remote_code=settings.allow_unverified_code,
+                download_mode="force_redownload"
+                if settings.disable_hf_datasets_cache
+                else "reuse_dataset_if_exists",
+            )
+        except ValueError as e:
+            if "trust_remote_code" in str(e):
+                raise UnitxtUnverifiedCodeError(path) from e
+            raise e  # Re raise
 @retry_connection_with_exponential_backoff(backoff_factor=2)
         pass
     def load_data(self) -> MultiStream:
+        with error_context(
+            self,
+            stage="Data Loading",
+            help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+        ):
             iterables = self.load_iterables()
+            if isoftype(iterables, MultiStream):
+                return iterables
+            return MultiStream.from_iterables(iterables, copying=True)
     def process(self) -> MultiStream:
         self._maybe_set_classification_policy()
     sep: str = ","
     def read_dataframe(self, file) -> pd.DataFrame:
+        with error_context(
+            stage="Raw Dataset Loading",
+            help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+        ):
+            return pd.read_csv(
+                file, sep=self.sep, low_memory=self.streaming, **self.get_args()
+            )
 def read_file(source) -> bytes:
     data_field: Optional[str] = None
     def read_dataframe(self, file) -> pd.DataFrame:
+        with error_context(
+            stage="Raw Dataset Loading",
+            help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+        ):
+            args = self.get_args()
+            if not self.lines:
+                data = json.loads(read_file(file))
+                if self.data_field:
+                    instances = dict_get(data, self.data_field)
+                    if not isoftype(instances, List[Dict[str, Any]]):
+                        raise UnitxtError(
+                            f"{self.data_field} of file {file} is not a list of dictionariess in LoadJsonFile loader"
+                        )
                 else:
+                    if isoftype(data, Dict[str, Any]):
+                        instances = [data]
+                    elif isoftype(data, List[Dict[str, Any]]):
+                        instances = data
+                    else:
+                        raise UnitxtError(
+                            f"data of file {file} is not dictionary or a list of dictionaries in LoadJsonFile loader"
+                        )
+                dataframe = pd.DataFrame(instances)
+            else:
+                if self.data_field is not None:
                     raise UnitxtError(
+                        "Can not load from a specific 'data_field' when loading multiple lines (lines=True)"
                     )
+                dataframe = pd.read_json(file, lines=self.lines, **args)
+            return dataframe
 class LoadFromSklearn(LazyLoader):
         dataset_id = str(self) + "_" + split
         dataset = self.__class__._loader_cache.get(dataset_id, None)
         if dataset is None:
+            with error_context(
+                stage="Raw Dataset Loading",
+                help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+            ):
+                split_data = self.downloader(subset=split)
+                targets = [split_data["target_names"][t] for t in split_data["target"]]
             df = pd.DataFrame([split_data["data"], targets]).T
             df.columns = ["data", "target"]
             dataset = df.to_dict("records")
                     if self.data_dir is not None
                     else data_file
                 )
+                with error_context(
+                    stage="Raw Dataset Download",
+                    help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+                ):
+                    with tempfile.NamedTemporaryFile() as temp_file:
+                        # Download to a temporary file in the same file partition, and then do an atomic move
+                        self._download_from_cos(
+                            cos,
+                            self.bucket_name,
+                            object_key,
+                            local_dir + "/" + os.path.basename(temp_file.name),
+                        )
+                        os.renames(
+                            local_dir + "/" + os.path.basename(temp_file.name),
+                            local_dir + "/" + data_file,
+                        )
         if isinstance(self.data_files, list):
             dataset = hf_load_dataset(local_dir, streaming=False, field=self.data_field)
     def verify(self):
         super().verify()
+        with error_context(
+            stage="Dataset Loading",
+            help="https://www.unitxt.ai/en/latest/unitxt.loaders.html#module-unitxt.loaders",
+        ):
+            if not isoftype(self.data, Dict[str, List[Dict[str, Any]]]):
+                raise ValueError(
+                    f"Passed data to LoadFromDictionary is not of type Dict[str, List[Dict[str, Any]]].\n"
+                    f"Expected data should map between split name and list of instances.\n"
+                    f"Received value: {self.data}\n"
+                )
+            for split in self.data.keys():
+                if len(self.data[split]) == 0:
+                    raise ValueError(f"Split {split} has no instances.")
+                first_instance = self.data[split][0]
+                for instance in self.data[split]:
+                    if instance.keys() != first_instance.keys():
+                        raise ValueError(
+                            f"Not all instances in split '{split}' have the same fields.\n"
+                            f"instance {instance} has different fields different from {first_instance}"
+                        )
     def _maybe_set_classification_policy(self):
         self.set_default_data_classification(
     chunksize: int = 100000
     loader_limit: Optional[int] = None
     streaming: bool = False
+    api_key_env_var: Optional[str] = None
     headers: Optional[Dict[str, Any]] = None
     data_field: str = "data"
     method: str = "GET"

metric.py CHANGED Viewed

@@ -56,7 +56,6 @@ from .settings_utils import get_constants
 from .span_lableing_operators import __file__ as _
 from .split_utils import __file__ as _
 from .splitters import __file__ as _
-from .sql_utils import __file__ as _
 from .standard import __file__ as _
 from .stream import __file__ as _
 from .stream_operators import __file__ as _
@@ -65,6 +64,7 @@ from .struct_data_operators import __file__ as _
 from .system_prompts import __file__ as _
 from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _

 from .span_lableing_operators import __file__ as _
 from .split_utils import __file__ as _
 from .splitters import __file__ as _
 from .standard import __file__ as _
 from .stream import __file__ as _
 from .stream_operators import __file__ as _
 from .system_prompts import __file__ as _
 from .task import __file__ as _
 from .templates import __file__ as _
+from .text2sql_utils import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _

metric_utils.py CHANGED Viewed

@@ -9,7 +9,7 @@ import pandas as pd
 from datasets import Features, Value
 from .dataclass import Dataclass
-from .error_utils import Documentation, UnitxtError
 from .operator import (
     InstanceOperator,
     MultiStreamOperator,
@@ -36,6 +36,9 @@ from .utils import recursive_copy
 constants = get_constants()
 def nan_mean(scores):
     result = mean(score for score in scores if score == score)
@@ -56,7 +59,10 @@ class FromPredictionsAndOriginalData(StreamInitializerOperator):
             yield {**original, "prediction": prediction}
     def process(
-        self, predictions: List[str], references: Iterable, split_name: str = "all"
     ) -> MultiStream:
         return MultiStream(
             {
@@ -152,7 +158,7 @@ class SplitSubsetsAndGroups(MultiStreamOperator):
                 subset_stream_name = (
                     stream_name
-                    + "://"
                     + "/".join(instance[self.subsets_field][: self.subset_depth])
                 )
@@ -190,7 +196,7 @@ def group_str_to_key_value(group_str):
 @lru_cache(maxsize=None)
 def stream_name_to_origin_subset_group(stream_name):
-    origin, subset_group = stream_name.split("://")
     if "?" in subset_group:
         subset, group = subset_group.split("?")
     else:
@@ -734,22 +740,23 @@ def _compute(
     predictions: List[Any],
     references: Iterable,
     flatten: bool = False,
-    split_name: str = "all",
     calc_confidence_intervals: bool = True,
 ):
     _reset_env_local_catalogs()
     register_all_artifacts()
     recipe = MetricRecipe(calc_confidence_intervals=calc_confidence_intervals)
-    multi_stream = recipe(
-        predictions=predictions, references=references, split_name=split_name
-    )
-    if flatten:
-        operator = FlattenInstances()
-        multi_stream = operator(multi_stream)
-    stream = multi_stream[split_name]
     return EvaluationResults(stream)

 from datasets import Features, Value
 from .dataclass import Dataclass
+from .error_utils import Documentation, UnitxtError, error_context
 from .operator import (
     InstanceOperator,
     MultiStreamOperator,
 constants = get_constants()
+DEFAULT_STREAM_NAME = "all_data"
+DEFAULT_STREAM_SUBSET_SEPARATOR = ">>"
 def nan_mean(scores):
     result = mean(score for score in scores if score == score)
             yield {**original, "prediction": prediction}
     def process(
+        self,
+        predictions: List[str],
+        references: Iterable,
+        split_name: str = DEFAULT_STREAM_NAME,
     ) -> MultiStream:
         return MultiStream(
             {
                 subset_stream_name = (
                     stream_name
+                    + DEFAULT_STREAM_SUBSET_SEPARATOR
                     + "/".join(instance[self.subsets_field][: self.subset_depth])
                 )
 @lru_cache(maxsize=None)
 def stream_name_to_origin_subset_group(stream_name):
+    origin, subset_group = stream_name.split(DEFAULT_STREAM_SUBSET_SEPARATOR)
     if "?" in subset_group:
         subset, group = subset_group.split("?")
     else:
     predictions: List[Any],
     references: Iterable,
     flatten: bool = False,
+    split_name: str = DEFAULT_STREAM_NAME,
     calc_confidence_intervals: bool = True,
 ):
     _reset_env_local_catalogs()
     register_all_artifacts()
     recipe = MetricRecipe(calc_confidence_intervals=calc_confidence_intervals)
+    with error_context(stage="Metric Processing"):
+        multi_stream = recipe(
+            predictions=predictions, references=references, split_name=split_name
+        )
+        if flatten:
+            operator = FlattenInstances()
+            multi_stream = operator(multi_stream)
+        stream = multi_stream[split_name]
     return EvaluationResults(stream)

metrics.py CHANGED Viewed

@@ -8,7 +8,8 @@ import uuid
 import warnings
 from abc import ABC, abstractmethod
 from collections import Counter, defaultdict
-from dataclasses import field
 from enum import Enum
 from functools import lru_cache
 from typing import (
@@ -42,7 +43,8 @@ from .dataclass import (
     OptionalField,
 )
 from .deprecation_utils import deprecation
-from .error_utils import Documentation, UnitxtError, UnitxtWarning
 from .inference import (
     HFPipelineBasedInferenceEngine,
     InferenceEngine,
@@ -64,6 +66,7 @@ from .operators import ArtifactFetcherMixin, Copy, FieldOperator, Set
 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
 from .type_utils import isoftype, parse_type_string, to_type_string
 from .types import ToolCall
 from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff
@@ -382,28 +385,35 @@ class MapReduceMetric(
         return intermediates
     def process(self, stream: Stream, stream_name: Optional[str] = None):
-        instances_scores, global_scores = self.compute(stream, stream_name)
-        for i, (instance, instance_scores) in enumerate(zip(stream, instances_scores)):
-            previous_score = instance.get("score", {"global": {}, "instance": {}})
-            if i == 0:
-                for key in global_scores:
-                    if is_original_key(key) and key in previous_score["global"]:
-                        UnitxtWarning(
-                            message=f"Metric '{key}' that has just been evaluated with value {global_scores[key]}, is already recorded "
-                            f"to have value {previous_score['global'][key]} by a previous metric evaluation on this instance or stream. "
-                            f"To avoid overwriting the existing value, add a score_prefix to the metric name (e.g. score_prefix='my_second_' , "
-                            f"which will yield, in this case, a score named: 'my_second_{key}')",
-                            additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
-                        )
-            global_scores = {**previous_score["global"], **global_scores}
-            instance_scores = {**previous_score["instance"], **instance_scores}
-            yield {
-                **instance,
-                "score": {"global": global_scores, "instance": instance_scores},
-            }
     def compute(self, stream: Stream, stream_name: Optional[str] = None):
         evaluation_inputs_stream = self._instances_stream_to_evaluation_inputs(stream)
@@ -453,6 +463,43 @@ class DictReduction(AggregationReduction[Dict[str, float]]):
         return result
 class MeanReduction(DictReduction):
     def reduce_list(self, lst: List[float]):
         return nan_mean(lst)
@@ -468,6 +515,91 @@ class MaxReduction(DictReduction):
         return float(nan_max(lst))
 class ReductionInstanceMetric(
     MapReduceMetric[PredictionType, IntermediateType],
     Generic[PredictionType, IntermediateType],
@@ -704,6 +836,52 @@ class ToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]]):
         }
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
     # Use None to disable confidence interval computation.
@@ -954,83 +1132,88 @@ class GlobalMetric(StreamOperator, MetricWithConfidenceInterval):
     process_single_instances = True
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
-        references = []
-        predictions = []
-        task_data = []
-        instances = []
-        for instance in stream:
-            instance = self.verify_instance(instance)
-            if "score" not in instance:
-                instance["score"] = {"global": {}, "instance": {}}
-            instance_references, instance_prediction = (
-                instance["references"],
-                instance["prediction"],
-            )
-            references.append(instance_references)
-            predictions.append(instance_prediction)
-            instances.append(instance)
-            instance_task_data = (
-                instance["task_data"] if "task_data" in instance else {}
-            )
-            task_data.append(instance_task_data)
-            instance_score = None
-            # for backward compatibility
-            no_score_value = np.nan
-            if self.process_single_instances:
-                try:
-                    instance_score = self._compute(
-                        [instance_references],
-                        [instance_prediction],
-                        [instance_task_data],
-                    )
-                except:
-                    no_score_value = None
-            if not instance_score:
-                instance_score = {
-                    "score": no_score_value,
-                    "score_name": self.main_score,
-                }
-                if isinstance(self.main_score, str):
-                    instance_score[self.main_score] = no_score_value
-            instance["score"]["instance"].update(
-                self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
-                    instance_score, instance["score"]["instance"]
                 )
-            )
-        self._validate_references_and_prediction(references, predictions)
-        global_score = {"num_of_instances": len(instances)}
-        result = self._compute(references, predictions, task_data)
-        global_score.update(
-            self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
-                result, global_score
             )
-        )
-        if self.ci_scores:
-            score_names = [
-                self._add_score_prefix(score_name) for score_name in self.ci_scores
-            ]
-        else:
-            score_names = [global_score["score_name"]]
-        for score_name in score_names:
-            confidence_interval = self.compute_global_confidence_intervals(
-                references, predictions, task_data, score_name
-            )
-            global_score.update(confidence_interval)
-        for instance in instances:
-            self.update_and_adjust_global_score(instance, global_score)
-            yield instance
     def _compute(
         self,
@@ -1080,96 +1263,105 @@ class BulkInstanceMetric(StreamOperator, MetricWithConfidenceInterval):
         return instance
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
-        instances = []
-        for instance in stream:
-            self.verify_instance(instance)
-            instance = self.preprocess_instance(instance)
-            instances.append(instance)
-        predictions = [instance["prediction"] for instance in instances]
-        references = [instance["references"] for instance in instances]
-        task_data = [
-            instance["task_data"] if "task_data" in instance else {}
-            for instance in instances
-        ]
-        self._validate_references_and_prediction(references, predictions)
-        global_score = {"num_of_instances": len(instances)}
-        # compute the metric over all refs and preds
-        instance_scores = self.compute(
-            references=references,
-            predictions=predictions,
-            task_data=task_data,
-        )
-        # add the score and score_name fields
-        for instance_score in instance_scores:
-            instance_score["score"] = instance_score[self.main_score]
-            instance_score["score_name"] = self.main_score
-        for instance, score in zip(instances, instance_scores):
-            if "score" not in instance:
-                instance["score"] = {"global": {}, "instance": {}}
-            instance["score"]["instance"].update(
-                self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
-                    score, instance["score"]["instance"]
                 )
-            )
-        for reduction, fields in self.reduction_map.items():
-            assert (
-                reduction in self.implemented_reductions
-            ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
-            if reduction == "mean":
-                for field_name in fields:
-                    field_name_with_prefix = self._add_score_prefix(field_name)
-                    global_score[field_name_with_prefix] = nan_mean(
-                        [
-                            instance["score"]["instance"][field_name_with_prefix]
-                            for instance in instances
-                        ]
-                    )
-                    if field_name == self.main_score:
-                        global_score["score"] = global_score[field_name_with_prefix]
-                        global_score["score_name"] = self.score_prefix + self.main_score
-                ci_fields = (
-                    list(set(self.ci_scores))
-                    if self.ci_scores is not None
-                    else [self.main_score]
-                )
-                ci_fields_with_prefix = [
-                    self._add_score_prefix(ci_field) for ci_field in ci_fields
-                ]
-                confidence_interval = self.score_based_confidence_interval(
-                    instances=instances, score_names=ci_fields_with_prefix
-                )
-                global_score.update(confidence_interval)
-            if reduction == "weighted_win_rate":
-                for field_name in fields:
-                    field_name_with_prefix = self._add_score_prefix(field_name)
-                    total_battles = 0
-                    wins = 0
-                    for instance in instances:
-                        s = instance["score"]["instance"][field_name_with_prefix]
-                        if s > 0:
-                            total_battles += s
-                            wins += s
-                        elif s < 0:
-                            total_battles += abs(s)
-                        else:
-                            total_battles += 2
-                            wins += 1
-                    global_score[field_name_with_prefix] = wins / total_battles
-                    if field_name == self.main_score:
-                        global_score["score"] = global_score[field_name_with_prefix]
-                        global_score["score_name"] = self.score_prefix + self.main_score
-        for instance in instances:
-            self.update_and_adjust_global_score(instance, global_score)
-            yield instance
     @abstractmethod
     def compute(
@@ -1475,91 +1667,97 @@ class InstanceMetric(StreamOperator, MetricWithConfidenceInterval):
             assert isinstance(fields["score_fields"], list)
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
-        instance_scores = self.compute_instance_scores(stream)
-        global_score = {"num_of_instances": len(instance_scores)}
-        for reduction_type, reduction_params in self.reduction_map.items():
-            assert (
-                reduction_type in self.implemented_reductions
-            ), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}"
-            field_name_full_prefix = ""
-            # used for passing to the bootstrapping, depends on whether the groups are fixed or not
-            aggregation_function = None
-            if reduction_type == "mean":
-                aggregation_function = self.average_item_scores
-                reduction_fields = list(set(reduction_params))
-                # no group reduction, so resample instances individually
-                scores_to_resample = instance_scores
-            elif reduction_type == "max":
-                aggregation_function = self.max_item_scores
-                reduction_fields = list(set(reduction_params))
-                # no group reduction, so resample instances individually
-                scores_to_resample = instance_scores
-            elif reduction_type == "group_mean":
-                aggregation_function = self.average_item_scores
-                self._validate_group_mean_reduction()
-                reduction_fields = (
-                    [self.main_score]
-                    if "score_fields" not in reduction_params
-                    else list(set(reduction_params["score_fields"]))
-                )
-                aggregation_function_name = str(reduction_params["agg_func"][0])
-                field_name_full_prefix = "group_" + aggregation_function_name + "_"
-                do_resample_as_group = reduction_params["agg_func"][2]
-                if do_resample_as_group:
-                    # prepend fixed_ to the name because the groups are resampled as fixed units
-                    field_name_full_prefix = "fixed_" + field_name_full_prefix
-                (
-                    scores_to_resample,
-                    aggregation_function,
-                ) = self._set_up_group_mean_aggregation(
-                    instance_scores,
-                    reduction_params,
-                    reduction_fields,
-                )
-            else:
-                raise ValueError(
-                    f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}."
-                )
-            # calculate global scores for each reduction field
-            for field_name in reduction_fields:
-                field_name_full = (
-                    field_name_full_prefix + self.score_prefix + field_name
-                )
-                # if group resampling (3rd element of agg_func parameter) is True, then
-                #   1. scores_to_resample are the group scores, and
-                #   2. aggregation_function is to take the raw mean
-                # if group resampling (3rd element of agg_func parameter) is False, then
-                #   1. scores_to_resample are the original instance scores, and
-                #   2. aggregation_function is to apply the group aggregation from the instance scores
-                # either way, the application of aggregation_function to scores_to_resample yields the global score
-                global_score[field_name_full] = aggregation_function(
-                    scores_to_resample, self.score_prefix + field_name
-                )
-                if field_name == self.main_score:
-                    global_score["score"] = global_score[field_name_full]
-                    global_score["score_name"] = field_name_full
-            # need to specify which fields should have CIs calculated for them through ci_scores
-            # (will not automatically calculate CIs for fields in reduction map)
-            if self.ci_scores is not None:
-                confidence_interval = self.score_based_confidence_interval(
-                    instances=scores_to_resample,
-                    score_names=[
-                        self.score_prefix + ci_score for ci_score in set(self.ci_scores)
-                    ],
-                    ci_score_prefix=field_name_full_prefix,
-                    aggregation_func=aggregation_function,
-                )
-                global_score.update(confidence_interval)
-        for instance in instance_scores:
-            self.update_and_adjust_global_score(instance, global_score)
-        for i, instance in enumerate(stream):
-            instance["score"] = recursive_copy(instance_scores[i]["score"])
-            yield instance
     def compute_instance_scores(
         self, stream: Stream, stream_name: Optional[str] = None
@@ -6436,391 +6634,102 @@ RISK_TYPE_TO_CLASS: Dict[RiskType, GraniteGuardianBase] = {
 }
-class SQLExecutionAccuracy(InstanceMetric):
-    reduction_map = {
-        "mean": [
-            "execution_accuracy",
-            "non_empty_execution_accuracy",
-            "subset_non_empty_execution_result",
-            "non_empty_gold_df",
-            "gold_sql_runtime",
-            "predicted_sql_runtime",
-            "pred_to_gold_runtime_ratio",
-            "gold_error",
-            "predicted_error",
-        ]
-    }
     main_score = "non_empty_execution_accuracy"
     ci_scores = [
         "execution_accuracy",
         "non_empty_execution_accuracy",
-        "subset_non_empty_execution_result",
         "gold_sql_runtime",
         "predicted_sql_runtime",
     ]
-    prediction_type = "Any"  # string representation is compared
-    sql_timeout = 30.0
-    _requirements_list = ["sqlglot", "func_timeout"]
-    @staticmethod
-    def compare_dfs_ignore_colnames_ordered_rows(df1, df2):
-        """Compares two DataFrames based on row content, ignoring column names.
-        Args:
-            df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
-            df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
-        Returns:
-            True if the DataFrames have the same ordered rows (ignoring column names),
-            False otherwise.
-        """
-        df1.fillna(0, inplace=True)
-        df2.fillna(0, inplace=True)
-        # Compare row counts first for a quick check
-        if df1.shape != df2.shape:
-            return False
-        # Convert DataFrames to numpy arrays of strings to handle mixed types
-        df1_array = df1.values.astype(str)
-        df2_array = df2.values.astype(str)
-        # Sort each row's elements (column order independence)
-        df1_sorted_rows = np.array([np.sort(row) for row in df1_array])
-        df2_sorted_rows = np.array([np.sort(row) for row in df2_array])
-        # Compare the sorted rows in order
-        return np.array_equal(df1_sorted_rows, df2_sorted_rows)
-    @staticmethod
-    def compare_dfs_ignore_colnames_unordered_rows(df1, df2):
-        """Compares two DataFrames based on row content, ignoring row order and column names.
-        Args:
-            df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
-            df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
-        Returns:
-            True if the DataFrames have the same content (ignoring column names and row order),
-            False otherwise.
-        """
-        # Compare shapes early on
-        if df1.shape != df2.shape:
-            return False
-        # Convert DataFrames to numpy arrays of strings (to handle mixed data types)
-        df1_array = df1.values.astype(str)
-        df2_array = df2.values.astype(str)
-        # Sort columns first, then sort rows
-        df1_sorted = np.sort(np.sort(df1_array, axis=1), axis=0)
-        df2_sorted = np.sort(np.sort(df2_array, axis=1), axis=0)
-        # Compare the sorted arrays
-        return np.array_equal(df1_sorted, df2_sorted)
-    @staticmethod
-    def compare_dfs_ignore_colnames_subset(df1, df2, ignore_row_order=True):
-        """Checks if the values of either DataFrame are a subset of the values in the other DataFrame.
-        Comparison is column order independent, and could optionally be row order independent.
-        We interpret "subset" as follows:
-        - For each row in df1, there must be a matching (or superset) row in df2, i.e. the set of values
-          in the df1 row is a subset of the set of values in that df2 row. Then do the same check in reverse.
-        - If either condition (df1 is subset of df2 OR df2 is subset of df1) is satisfied, return True.
-        We treat an empty dataframe as a subset of nothing, while in theory is a subset of any dataframe.
-        Args:
-            df1 (pd.DataFrame): Pandas DataFrame 1 to compare.
-            df2 (pd.DataFrame): Pandas DataFrame 2 to compare.
-            ignore_row_order (bool): If True, row order doesn't matter; if False, row order is respected.
-        Returns:
-            bool: True if df1 is a subset of df2 or vice versa, based on the specified row-order condition.
-        """
-        df1_array = df1.values.astype(str)
-        df2_array = df2.values.astype(str)
-        df1_sorted_rows = [np.sort(row) for row in df1_array]
-        df2_sorted_rows = [np.sort(row) for row in df2_array]
-        def row_is_subset(r_small, r_big):
-            """Check if all elements of r_small are in r_big."""
-            return set(r_small).issubset(set(r_big))
-        def df_is_subset_of_another(rows_small, rows_big, respect_order):
-            """Check if the rows_small is subset of rows_big under the given order condition."""
-            if not rows_small:
-                return False  # DataFrame needs to be non-empty
-            # If row order matters:
-            if respect_order:
-                i, j = 0, 0
-                while i < len(rows_small) and j < len(rows_big):
-                    if row_is_subset(rows_small[i], rows_big[j]):
-                        i += 1
-                    j += 1
-                return i == len(rows_small)
-            # Row order doesn't matter:
-            matched_indices = set()
-            for r_small in rows_small:
-                found_match = False
-                for idx, r_big in enumerate(rows_big):
-                    if idx not in matched_indices and row_is_subset(r_small, r_big):
-                        found_match = True
-                        matched_indices.add(idx)
-                        break
-                if not found_match:
-                    return False
-            return True
-        df1_sub_df2 = df_is_subset_of_another(
-            df1_sorted_rows, df2_sorted_rows, not ignore_row_order
-        )
-        df2_sub_df1 = df_is_subset_of_another(
-            df2_sorted_rows, df1_sorted_rows, not ignore_row_order
         )
-        return df1_sub_df2 or df2_sub_df1
-    def get_sql_execution_results(
-        self, predicted_sql: str, gold_sql: str, connector
-    ) -> (int, int, int, int, int, int, int, int, int, str, str, str):
-        """Runs SQL queries using the provided connector and gets scores and results.
-        Args:
-            predicted_sql (str): predicted SQL query
-            gold_sql (str): gold reference SQL query
-            connector: database connector
-        Returns:
-        a 12-tuple of
-        1. execution_result: if df responses match
-        2. non_empty_execution_result: if dfs are non-empty and match
-        3. subset_non_empty_execution_result: if non-empty dfs and one is a subset of the other
-        4. non_empty_gold_df: if gt df is non-empty
-        5. gold_sql_runtime: ground truth query runtime
-        6. predicted_sql_runtime: predicted query runtime
-        7. pred_to_gold_runtime_ratio: ratio of predicted query runtime to gt query runtime
-        8. gold_error: if gt has an error
-        9. predicted_error: if predicted query has an error
-        10. ground truth dataframe
-        11. predicted query's dataframe
-        12. error message (if any)
-        """
-        import time
-        from func_timeout import func_timeout
-        from func_timeout.exceptions import FunctionTimedOut
-        from .sql_utils import sqlglot_optimized_equivalence
-        gold_res = None
-        gold_error = ""
-        gold_sql_runtime = 0
-        try:
-            start_time = time.perf_counter()
-            gold_res, gold_error = func_timeout(
-                self.sql_timeout,
-                connector.execute_query,
-                args=(gold_sql,),
-            )
-            end_time = time.perf_counter()
-            gold_sql_runtime = end_time - start_time
-        except FunctionTimedOut as e:
-            pred_error = f"Timeout error executing gold SQL: {e}"
-            logger.warning(pred_error)
-        except Exception as e:
-            gold_error = f"Error executing gold SQL: {e}"
-        if gold_error is not None:
-            return (
-                0,
-                0,
-                0,
-                0,
-                gold_sql_runtime,
-                0,
-                0,
-                0,
-                0,
-                "",
-                "",
-                gold_error,
-            )
-        if isinstance(gold_res, dict) and "results" in gold_res:
-            gold_res = gold_res["results"]
-        gold_df = pd.DataFrame(gold_res)
-        non_empty_gold_df = 0 if gold_df.empty else 1
-        no_execution_match_result = (
-            1,
-            non_empty_gold_df,
-            non_empty_gold_df,
-            non_empty_gold_df,
-            gold_sql_runtime,
-            0,
-            0,
-            0,
-            0,
-            gold_df.to_json(),
-            "",
-            "",
-        )
-        if predicted_sql.lower().strip() == gold_sql.lower().strip():
-            return no_execution_match_result
-        try:
-            if sqlglot_optimized_equivalence(gold_sql, predicted_sql):
-                return no_execution_match_result
-        except Exception as e:  # Catch specific exceptions if possible
-            logger.info(
-                f"Couldn't test equivalent_sqls: {e}. Treating as non-equivalent and going to test with the db."
-            )
-        pred_res = None
-        pred_error = ""
-        pred_sql_runtime = 0
-        try:
-            start_time = time.perf_counter()
-            pred_res, pred_error = func_timeout(
-                self.sql_timeout,
-                connector.execute_query,
-                args=(predicted_sql,),
-            )
-            end_time = time.perf_counter()
-            pred_sql_runtime = end_time - start_time
-        except FunctionTimedOut as e:
-            pred_error = f"Timeout error executing predicted SQL: {e}"
-            logger.info(pred_error)
-        except Exception as e:
-            pred_error = f"Error executing predicted SQL: {e}"
-            logger.info(pred_error)
-        pred_to_gold_runtime_ratio = (
-            float(pred_sql_runtime) / gold_sql_runtime if gold_sql_runtime > 0 else 0
         )
-        if pred_res is None:
-            return (
-                0,
-                0,
-                0,
-                0,
-                gold_sql_runtime,
-                pred_sql_runtime,
-                pred_to_gold_runtime_ratio,
-                0,
-                1,
-                "",
-                "",
-                pred_error,
-            )
-        if isinstance(pred_res, dict) and "results" in pred_res:
-            pred_res = pred_res["results"]
-        predicted_df = pd.DataFrame(pred_res)
-        subset_non_empty_execution_result = 0
-        non_empty_execution_result = 0
-        if "ORDER BY" in gold_sql.upper():
-            execution_result = (
-                1
-                if self.compare_dfs_ignore_colnames_ordered_rows(predicted_df, gold_df)
-                else 0
-            )
-            if non_empty_gold_df:
-                if execution_result == 1:
-                    non_empty_execution_result = 1
-                if self.compare_dfs_ignore_colnames_subset(
-                    gold_df, predicted_df, ignore_row_order=False
-                ):
-                    subset_non_empty_execution_result = 1
-        else:
-            execution_result = (
-                1
-                if self.compare_dfs_ignore_colnames_unordered_rows(
-                    predicted_df, gold_df
-                )
-                else 0
-            )
-            if non_empty_gold_df:
-                if execution_result == 1:
-                    non_empty_execution_result = 1
-                if self.compare_dfs_ignore_colnames_subset(
-                    gold_df, predicted_df, ignore_row_order=True
-                ):
-                    subset_non_empty_execution_result = 1
-        return (
-            execution_result,
-            non_empty_execution_result,
-            subset_non_empty_execution_result,
-            non_empty_gold_df,
-            gold_sql_runtime,
-            pred_sql_runtime,
-            pred_to_gold_runtime_ratio,
-            0,
-            0,
-            gold_df.to_json(),
-            predicted_df.to_json(),
-            pred_error,
         )
-    def compute(self, references: List[Any], prediction: str, task_data: Dict) -> dict:
-        from .sql_utils import get_db_connector
-        predicted_sql = prediction
-        execution_result: float = 0.0
-        if predicted_sql and predicted_sql.strip() != "":
-            if not predicted_sql.startswith("SELECT") and "SELECT" in predicted_sql:
-                predicted_sql = predicted_sql[predicted_sql.find("SELECT") :]
-            if ";" in predicted_sql:
-                predicted_sql = predicted_sql[: predicted_sql.find(";") + 1]
-            db_connector = get_db_connector(task_data["db"]["db_type"])(task_data["db"])
-            logger.debug(
-                f"Starting to get SQL execution results over DB: {task_data['db']}"
-            )
-            (
-                execution_result,
-                non_empty_execution_result,
-                subset_non_empty_execution_result,
-                non_empty_gold_df,
-                gold_sql_runtime,
-                predicted_sql_runtime,
-                pred_to_gold_runtime_ratio,
-                gold_error,
-                predicted_error,
-                gold_df_json,
-                predicted_df_json,
-                error_message,
-            ) = self.get_sql_execution_results(
-                predicted_sql, references[0], db_connector
-            )
-        result = {
-            "execution_accuracy": float(execution_result),
-            "non_empty_execution_accuracy": float(non_empty_execution_result),
-            "subset_non_empty_execution_result": float(
-                subset_non_empty_execution_result
-            ),
-            "non_empty_gold_df": float(non_empty_gold_df),
-            "gold_sql_runtime": float(gold_sql_runtime),
-            "predicted_sql_runtime": float(predicted_sql_runtime),
-            "pred_to_gold_runtime_ratio": float(pred_to_gold_runtime_ratio),
-            "gold_error": float(gold_error),
-            "predicted_error": float(predicted_error),
-            "error_message": str(error_message),
-            "gold_df_json": str(gold_df_json),
-            "predicted_df_json": str(predicted_df_json),
-        }
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         logger.debug(f"SQL Execution Accuracy Result: {result}")
@@ -6828,34 +6737,22 @@ class SQLExecutionAccuracy(InstanceMetric):
 class SQLNonExecutionAccuracy(InstanceMetric):
-    reduction_map = {
-        "mean": [
-            "sqlglot_validity",
-            "sqlparse_validity",
-            "sqlglot_equivalence",
-            "sqlglot_optimized_equivalence",
-            "sqlparse_equivalence",
-            "sql_exact_match",
-            "sql_syntactic_equivalence",
-        ]
-    }
-    main_score = "sqlglot_equivalence"
-    ci_scores = [
-        "sqlglot_validity",
-        "sqlparse_validity",
-        "sqlglot_equivalence",
-        "sqlglot_optimized_equivalence",
-        "sqlparse_equivalence",
-        "sql_exact_match",
-        "sql_syntactic_equivalence",
     ]
     prediction_type = "Any"  # string representation is compared
     _requirements_list = ["sqlglot", "sqlparse"]
     def compute(self, references: List[Any], prediction: str, task_data: Dict) -> dict:
-        from .sql_utils import (
             is_sqlglot_parsable,
             is_sqlparse_parsable,
             sql_exact_match,
@@ -6864,48 +6761,45 @@ class SQLNonExecutionAccuracy(InstanceMetric):
             sqlparse_queries_equivalent,
         )
-        predicted_sql = prediction
         gold_sql = references[0]
-        if predicted_sql and predicted_sql.strip() != "":
-            if not predicted_sql.startswith("SELECT") and "SELECT" in predicted_sql:
-                predicted_sql = predicted_sql[predicted_sql.find("SELECT") :]
-            if ";" in predicted_sql:
-                predicted_sql = predicted_sql[: predicted_sql.find(";") + 1]
         is_sqlglot_parsable = is_sqlglot_parsable(predicted_sql)
         is_sqlparse_parsable = is_sqlparse_parsable(predicted_sql)
-        result = {
-            "sqlglot_validity": float(is_sqlglot_parsable),
-            "sqlparse_validity": float(is_sqlparse_parsable),
-            "sqlglot_equivalence": float(
                 sqlglot_parsed_queries_equivalent(predicted_sql, gold_sql)
                 if is_sqlglot_parsable
                 else 0
             ),
-            "sqlglot_optimized_equivalence": float(
                 sqlglot_optimized_equivalence(predicted_sql, gold_sql)
                 if is_sqlglot_parsable
                 else 0
             ),
-            "sqlparse_equivalence": float(
                 sqlparse_queries_equivalent(predicted_sql, gold_sql)
                 if is_sqlparse_parsable
                 else 0
             ),
-            "sql_exact_match": float(sql_exact_match(predicted_sql, gold_sql)),
-        }
-        result["sql_syntactic_equivalence"] = float(
             any(
-                result[key]
-                for key in [
-                    "sqlglot_equivalence",
-                    "sqlglot_optimized_equivalence",
-                    "sqlparse_equivalence",
-                    "sql_exact_match",
                 ]
             )
         )
         logger.debug(f"SQL Non Execution Accuracy Result: {result}")
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score

 import warnings
 from abc import ABC, abstractmethod
 from collections import Counter, defaultdict
+from dataclasses import asdict, field
+from dataclasses import fields as dataclasses_fields
 from enum import Enum
 from functools import lru_cache
 from typing import (
     OptionalField,
 )
 from .deprecation_utils import deprecation
+from .dict_utils import dict_get
+from .error_utils import Documentation, UnitxtError, UnitxtWarning, error_context
 from .inference import (
     HFPipelineBasedInferenceEngine,
     InferenceEngine,
 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
+from .text2sql_utils import SQLExecutionResult, SQLNonExecutionMetricResult
 from .type_utils import isoftype, parse_type_string, to_type_string
 from .types import ToolCall
 from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff
         return intermediates
     def process(self, stream: Stream, stream_name: Optional[str] = None):
+        """Score the stream and yield each instance with merged score dictionaries.
+
+        The results of ``self.compute`` are layered on top of whatever ``score``
+        entry an instance already carries, so freshly computed values win on key
+        collisions. While handling the first instance only, a ``UnitxtWarning`` is
+        created for every global score name accepted by ``is_original_key`` that
+        is already present in the previous global scores.
+        """
+        with error_context(
+            self,
+            stage="Evaluating Metric",
+            help="https://www.unitxt.ai/en/latest/docs/adding_metric.html",
+        ):
+            instances_scores, global_scores = self.compute(stream, stream_name)
+            for position, (instance, fresh_instance_scores) in enumerate(
+                zip(stream, instances_scores)
+            ):
+                previous_score = instance.get("score", {"global": {}, "instance": {}})
+                if position == 0:
+                    # The overwrite check is done once, against the first instance.
+                    for key in global_scores:
+                        if not is_original_key(key):
+                            continue
+                        if key not in previous_score["global"]:
+                            continue
+                        UnitxtWarning(
+                            message=f"Metric '{key}' that has just been evaluated with value {global_scores[key]}, is already recorded "
+                            f"to have value {previous_score['global'][key]} by a previous metric evaluation on this instance or stream. "
+                            f"To avoid overwriting the existing value, add a score_prefix to the metric name (e.g. score_prefix='my_second_' , "
+                            f"which will yield, in this case, a score named: 'my_second_{key}')",
+                            additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
+                        )
+                # Previously recorded values first, newly computed values on top.
+                global_scores = {**previous_score["global"], **global_scores}
+                merged_instance_scores = {
+                    **previous_score["instance"],
+                    **fresh_instance_scores,
+                }
+                scored_instance = dict(instance)
+                scored_instance["score"] = {
+                    "global": global_scores,
+                    "instance": merged_instance_scores,
+                }
+                yield scored_instance
     def compute(self, stream: Stream, stream_name: Optional[str] = None):
         evaluation_inputs_stream = self._instances_stream_to_evaluation_inputs(stream)
         return result
+class GroupReduction(AggregationReduction[Tuple[str, Dict[str, float]]]):
+    """Reduce ``(id, scores)`` pairs by applying ``reduce_list`` per score name.
+
+    Subclasses override ``reduce_list``; the base implementation returns ``None``.
+    """
+    def reduce_list(self, lst: List[Tuple[str, float]]):
+        """Collapse the ``(id, value)`` pairs of one score name (no-op here)."""
+        return None
+    def reduce(self, intermidates: Tuple[str, Dict[str, float]]):
+        """Regroup the per-id score dicts by score name and reduce every group.
+
+        Returns:
+            A dict mapping each score name to ``reduce_list`` of its
+            ``(id, value)`` pairs, in first-seen order of the score names.
+        """
+        pairs_by_name = {}
+        for item_id, scores in intermidates:
+            for name, value in scores.items():
+                pairs_by_name.setdefault(name, []).append((item_id, value))
+        return {
+            name: self.reduce_list(pairs) for name, pairs in pairs_by_name.items()
+        }
+class GroupMean(GroupReduction):
+    """Group reduction that averages the values with ``nan_mean``; ids are unused."""
+    def reduce_list(self, lst: List[Tuple[str, float]]):
+        """Return ``nan_mean`` over the value component of every ``(id, value)`` pair."""
+        values = [value for _, value in lst]
+        return nan_mean(values)
+class SequentialSuccess(GroupReduction):
+    """Fraction of items that succeed, in id order, before the first failure.
+
+    Items are ordered by id, then counted while their value is strictly greater
+    than ``threshold``; counting stops at the first value that is not. The count
+    is divided by the total number of items.
+
+    Ids are ordered with a natural (digit-aware) key so that, for example,
+    ``"assistant_turn_2"`` precedes ``"assistant_turn_10"``. A plain string sort
+    would put turn 10 before turn 2 and break the sequence once a group holds
+    ten or more items.
+    """
+    threshold: float = 0.5
+    def reduce_list(self, lst: List[Tuple[str, float]]):
+        """Return the share of leading successes among the id-ordered values."""
+        import re
+
+        def natural_key(pair):
+            # re.split with a capturing group alternates text / digit chunks,
+            # so odd positions are always digit runs and compare as integers.
+            chunks = re.split(r"([0-9]+)", str(pair[0]))
+            return [
+                int(chunk) if index % 2 else chunk
+                for index, chunk in enumerate(chunks)
+            ]
+
+        ordered_values = [value for _, value in sorted(lst, key=natural_key)]
+        successful = 0
+        for value in ordered_values:
+            if value > self.threshold:
+                successful += 1
+            else:
+                break
+        return successful / len(lst)
 class MeanReduction(DictReduction):
     def reduce_list(self, lst: List[float]):
         return nan_mean(lst)
         return float(nan_max(lst))
+class GroupMetric(
+    MapReduceMetric[PredictionType, IntermediateType],
+    Generic[PredictionType, IntermediateType],
+):
+    """Wrap a map-reduce metric and aggregate its scores in two stages.
+
+    The wrapped ``metric`` maps the instances; every intermediate is tagged with
+    a group id and an item id read from the instance's task data. On reduction,
+    the items of each group are reduced with ``in_group_reduction`` and the
+    resulting per-group scores are reduced with ``cross_group_reduction``.
+    """
+    # Overwritten in prepare() with the wrapped metric's main score.
+    main_score: str = None
+    # The metric whose map_stream / reduce_one are delegated to.
+    metric: MapReduceMetric[PredictionType, IntermediateType]
+    # dict_get queries into task_data for the group and item identifiers.
+    group_id_field: str
+    item_id_field: str
+    in_group_reduction: GroupReduction = GroupMean()
+    cross_group_reduction: GroupReduction = GroupMean()
+    n_resamples = None
+    def _get_group_id(self, task_data) -> str:
+        """Return the group identifier of an instance as a string."""
+        group_id = dict_get(task_data, self.group_id_field)
+        return str(group_id)
+    def _get_item_id(self, task_data) -> str:
+        """Return the in-group item identifier of an instance as a string."""
+        item_id = dict_get(task_data, self.item_id_field)
+        return str(item_id)
+    def prepare(self):
+        """Run the parent preparation, then adopt the wrapped metric's main score."""
+        super().prepare()
+        self.main_score = self.metric.main_score
+    def map_stream(
+        self,
+        evaluation_inputs_stream: Generator[
+            EvaluationInput[PredictionType], None, None
+        ],
+    ) -> List[Tuple[IntermediateType, str, str]]:
+        """Map the inputs with the wrapped metric and attach group / item ids.
+
+        Returns:
+            A list of ``(intermediate, group_id, item_id)`` triples.
+        """
+        seen_group_ids: List[str] = []
+        seen_item_ids: List[str] = []
+        def id_recording_stream(
+            inputs: Generator[EvaluationInput[PredictionType], None, None],
+        ) -> Generator[
+            Tuple[PredictionType, List[PredictionType], Dict[str, Any]], None, None
+        ]:
+            # Ids are collected as a side effect while the wrapped metric
+            # consumes the inputs.
+            for prediction, references, task_data in inputs:
+                seen_group_ids.append(self._get_group_id(task_data))
+                seen_item_ids.append(self._get_item_id(task_data))
+                yield prediction, references, task_data
+        mapped: List[IntermediateType] = list(
+            self.metric.map_stream(id_recording_stream(evaluation_inputs_stream))
+        )
+        tagged = zip(mapped, seen_group_ids, seen_item_ids)
+        return list(tagged)
+    def reduce_group(self, dialog_data: Dict[str, Dict[str, Any]]):
+        """Reduce the ``item_id -> scores`` mapping of one group."""
+        item_pairs = list(dialog_data.items())
+        return self.in_group_reduction.reduce(item_pairs)
+    def reduce_one(self, intermidate: Tuple[IntermediateType, str, str]):
+        """Reduce a single tagged intermediate with the wrapped metric."""
+        wrapped_intermediate = intermidate[0]
+        return self.metric.reduce_one(wrapped_intermediate)
+    def reduce(
+        self, intermediates: List[Tuple[IntermediateType, str, str]]
+    ) -> Dict[str, Any]:
+        """Reduce per item, then within each group, then across all groups."""
+        items_by_group: Dict[str, Dict[str, Any]] = {}
+        for intermediate, group_id, item_id in intermediates:
+            item_scores = self.metric.reduce_one(intermediate)
+            items_by_group.setdefault(group_id, {})[item_id] = item_scores
+        group_scores: Dict[str, Dict[str, Any]] = {}
+        for group_id, group_items in items_by_group.items():
+            group_scores[group_id] = self.reduce_group(group_items)
+        return self.cross_group_reduction.reduce(list(group_scores.items()))
+class MultiTurnMetric(
+    GroupMetric[PredictionType, IntermediateType],
+    Generic[PredictionType, IntermediateType],
+):
+    """Group metric whose groups are read from ``conversation/id``.
+
+    The item id is derived from the length of the value found at
+    ``conversation/dialog``.
+    """
+    group_id_field = "conversation/id"
+    item_id_field = "conversation/dialog"
+    def _get_item_id(self, task_data):
+        """Return ``"assistant_turn_<k>"`` where ``k`` is half the dialog length."""
+        # presumably the dialog alternates user / assistant messages, which is
+        # why the length is halved — TODO confirm against the task schema
+        dialog = dict_get(task_data, self.item_id_field)
+        turn_number = len(dialog) // 2
+        return f"assistant_turn_{turn_number}"
 class ReductionInstanceMetric(
     MapReduceMetric[PredictionType, IntermediateType],
     Generic[PredictionType, IntermediateType],
         }
+class MultiTurnToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]]):
+    """Validate the arguments of each predicted tool call against its tool schema.
+
+    For every predicted call, the tool with the same name is looked up in
+    ``task_data["__tools__"]`` and the call's arguments are validated against
+    that tool's ``parameters`` JSON schema. The instance score is the fraction
+    of predicted calls that validate. References are not consulted.
+    """
+    main_score = "argument_schema_validation"
+    reduction = MeanReduction()
+    prediction_type = List[ToolCall]
+    _requirements_list = ["jsonschema-rs"]
+    def prepare(self):
+        """Import ``jsonschema_rs`` lazily and keep the module for validation."""
+        super().prepare()
+        import jsonschema_rs
+        self._schema = jsonschema_rs
+    def map(
+        self,
+        prediction: List[ToolCall],
+        references: List[List[ToolCall]],
+        task_data: Dict[str, Any],
+    ) -> Dict[str, float]:
+        """Return the share of predicted tool calls whose arguments fit the schema.
+
+        A call naming a tool that is absent from ``task_data["__tools__"]``
+        scores 0.0. A prediction with no tool calls scores 0.0 instead of
+        raising ``ZeroDivisionError``.
+        """
+        if not prediction:
+            # Nothing to validate: give no credit rather than divide by zero.
+            return {
+                "argument_schema_validation": 0.0,
+            }
+        validation_scores = []
+        for tool_call in prediction:
+            parameters = None
+            # When several tools share a name, the last match is used.
+            for tool in task_data["__tools__"]:
+                if tool["function"]["name"] == tool_call["name"]:
+                    parameters = tool["function"]["parameters"]
+            if parameters is None:
+                validation_scores.append(0.0)
+            else:
+                try:
+                    self._schema.validate(
+                        parameters,
+                        tool_call["arguments"],
+                    )
+                    validation_scores.append(1.0)
+                except self._schema.ValidationError:
+                    validation_scores.append(0.0)
+        argument_schema_validation = sum(validation_scores) / len(validation_scores)
+        return {
+            "argument_schema_validation": argument_schema_validation,
+        }
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
     # Use None to disable confidence interval computation.
     process_single_instances = True
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
+        """Score the whole stream at once and yield every instance with its scores.
+
+        Each instance is verified and collected. When ``process_single_instances``
+        is set, ``_compute`` is also run on that instance alone to produce an
+        instance-level score. After the stream is exhausted, ``_compute`` is run
+        over all references / predictions / task data to produce the global
+        score, confidence intervals are added for ``ci_scores`` (or for the
+        global ``score_name`` when ``ci_scores`` is empty), and the instances
+        are yielded with the global score applied.
+        """
+        with error_context(
+            self,
+            stage="Evaluating Metric",
+            help="https://www.unitxt.ai/en/latest/docs/adding_metric.html",
+        ):
+            references = []
+            predictions = []
+            task_data = []
+            instances = []
+            for instance in stream:
+                instance = self.verify_instance(instance)
+                if "score" not in instance:
+                    instance["score"] = {"global": {}, "instance": {}}
+                instance_references, instance_prediction = (
+                    instance["references"],
+                    instance["prediction"],
+                )
+                references.append(instance_references)
+                predictions.append(instance_prediction)
+                instances.append(instance)
+                instance_task_data = (
+                    instance["task_data"] if "task_data" in instance else {}
+                )
+                task_data.append(instance_task_data)
+                instance_score = None
+                # for backward compatibility
+                no_score_value = np.nan
+                if self.process_single_instances:
+                    try:
+                        instance_score = self._compute(
+                            [instance_references],
+                            [instance_prediction],
+                            [instance_task_data],
+                        )
+                    except Exception:
+                        # Best effort: a metric that cannot score one instance in
+                        # isolation records None. Only Exception is caught so that
+                        # KeyboardInterrupt / SystemExit still propagate.
+                        no_score_value = None
+                if not instance_score:
+                    # Missing or empty result: fall back to the placeholder value.
+                    instance_score = {
+                        "score": no_score_value,
+                        "score_name": self.main_score,
+                    }
+                    if isinstance(self.main_score, str):
+                        instance_score[self.main_score] = no_score_value
+                instance["score"]["instance"].update(
+                    self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
+                        instance_score, instance["score"]["instance"]
+                    )
                 )
+            self._validate_references_and_prediction(references, predictions)
+            global_score = {"num_of_instances": len(instances)}
+            result = self._compute(references, predictions, task_data)
+            global_score.update(
+                self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
+                    result, global_score
+                )
             )
+            if self.ci_scores:
+                score_names = [
+                    self._add_score_prefix(score_name) for score_name in self.ci_scores
+                ]
+            else:
+                score_names = [global_score["score_name"]]
+            for score_name in score_names:
+                confidence_interval = self.compute_global_confidence_intervals(
+                    references, predictions, task_data, score_name
+                )
+                global_score.update(confidence_interval)
+            for instance in instances:
+                self.update_and_adjust_global_score(instance, global_score)
+                yield instance
     def _compute(
         self,
         return instance
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         # Scores the whole stream in one bulk pass: gathers and preprocesses every
         # instance, calls self.compute once over all predictions / references /
         # task_data, stores the per-instance scores, reduces them into a global
         # score according to self.reduction_map, and finally yields the instances.
         # The whole body runs inside error_context (stage "Evaluating Metrics").
+        with error_context(
+            self,
+            stage="Evaluating Metrics",
+            help="https://www.unitxt.ai/en/latest/docs/adding_metric.html",
+        ):
             # materialize the stream; each instance is verified, then preprocessed
+            instances = []
+            for instance in stream:
+                self.verify_instance(instance)
+                instance = self.preprocess_instance(instance)
+                instances.append(instance)
+            predictions = [instance["prediction"] for instance in instances]
+            references = [instance["references"] for instance in instances]
             # instances without a "task_data" entry contribute an empty dict
+            task_data = [
+                instance["task_data"] if "task_data" in instance else {}
+                for instance in instances
+            ]
+            self._validate_references_and_prediction(references, predictions)
+            global_score = {"num_of_instances": len(instances)}
+            # compute the metric over all refs and preds
+            instance_scores = self.compute(
+                references=references,
+                predictions=predictions,
+                task_data=task_data,
+            )
+            # add the score and score_name fields
+            for instance_score in instance_scores:
+                instance_score["score"] = instance_score[self.main_score]
+                instance_score["score_name"] = self.main_score
             # pair instances with their scores positionally; the score container is
             # created on first use and the new scores are merged into its
             # "instance" part after passing through the prefixing helper
+            for instance, score in zip(instances, instance_scores):
+                if "score" not in instance:
+                    instance["score"] = {"global": {}, "instance": {}}
+                instance["score"]["instance"].update(
+                    self._add_score_prefixes_to_score_dict_and_check_against_existing_scores(
+                        score, instance["score"]["instance"]
+                    )
                 )
+            for reduction, fields in self.reduction_map.items():
+                assert (
+                    reduction in self.implemented_reductions
+                ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
+                if reduction == "mean":
                     # one global value per field, aggregated with nan_mean
                     # (presumably a NaN-ignoring mean - confirm in its definition)
+                    for field_name in fields:
+                        field_name_with_prefix = self._add_score_prefix(field_name)
+                        global_score[field_name_with_prefix] = nan_mean(
+                            [
+                                instance["score"]["instance"][field_name_with_prefix]
+                                for instance in instances
+                            ]
+                        )
+                        if field_name == self.main_score:
+                            global_score["score"] = global_score[field_name_with_prefix]
+                            global_score["score_name"] = (
+                                self.score_prefix + self.main_score
+                            )
                     # confidence intervals cover self.ci_scores (de-duplicated through
                     # a set, so their order is not preserved) or, when ci_scores is
                     # None, only the main score
+                    ci_fields = (
+                        list(set(self.ci_scores))
+                        if self.ci_scores is not None
+                        else [self.main_score]
+                    )
+                    ci_fields_with_prefix = [
+                        self._add_score_prefix(ci_field) for ci_field in ci_fields
+                    ]
+                    confidence_interval = self.score_based_confidence_interval(
+                        instances=instances, score_names=ci_fields_with_prefix
+                    )
+                    global_score.update(confidence_interval)
+                if reduction == "weighted_win_rate":
                     # a positive score s counts as s battles that were all won, a
                     # negative score as abs(s) battles with no win, and any other
                     # value (e.g. 0) as 2 battles with 1 win
                     # NOTE(review): with no instances total_battles stays 0 and the
                     # division below raises ZeroDivisionError - confirm an empty
                     # stream cannot reach this point
+                    for field_name in fields:
+                        field_name_with_prefix = self._add_score_prefix(field_name)
+                        total_battles = 0
+                        wins = 0
+                        for instance in instances:
+                            s = instance["score"]["instance"][field_name_with_prefix]
+                            if s > 0:
+                                total_battles += s
+                                wins += s
+                            elif s < 0:
+                                total_battles += abs(s)
+                            else:
+                                total_battles += 2
+                                wins += 1
+                        global_score[field_name_with_prefix] = wins / total_battles
+                        if field_name == self.main_score:
+                            global_score["score"] = global_score[field_name_with_prefix]
+                            global_score["score_name"] = (
+                                self.score_prefix + self.main_score
+                            )
             # pass the finished global score to every instance before yielding it
+            for instance in instances:
+                self.update_and_adjust_global_score(instance, global_score)
+                yield instance
     @abstractmethod
     def compute(
             assert isinstance(fields["score_fields"], list)
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         # Reduces per-instance scores into global scores. For every entry of
         # self.reduction_map it picks an aggregation function and the scores to
         # resample, writes one global score per reduction field, optionally adds
         # confidence intervals, and finally yields the stream's instances with
         # their "score" field filled in.
         # Raises ValueError for a reduction type that none of the branches handles.
+        with error_context(
+            self,
+            stage="Evaluating Metrics",
+            help="https://www.unitxt.ai/en/latest/docs/adding_metric.html",
+        ):
             # NOTE(review): compute_instance_scores is defined elsewhere; as used
             # below, its result supports len() and indexing and each item carries
             # a "score" entry
+            instance_scores = self.compute_instance_scores(stream)
+            global_score = {"num_of_instances": len(instance_scores)}
+            for reduction_type, reduction_params in self.reduction_map.items():
+                assert (
+                    reduction_type in self.implemented_reductions
+                ), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}"
+                field_name_full_prefix = ""
+                # used for passing to the bootstrapping, depends on whether the groups are fixed or not
+                aggregation_function = None
+                if reduction_type == "mean":
+                    aggregation_function = self.average_item_scores
+                    reduction_fields = list(set(reduction_params))
+                    # no group reduction, so resample instances individually
+                    scores_to_resample = instance_scores
+                elif reduction_type == "max":
+                    aggregation_function = self.max_item_scores
+                    reduction_fields = list(set(reduction_params))
+                    # no group reduction, so resample instances individually
+                    scores_to_resample = instance_scores
+                elif reduction_type == "group_mean":
+                    aggregation_function = self.average_item_scores
+                    self._validate_group_mean_reduction()
                     # reduce only the main score unless "score_fields" is given
+                    reduction_fields = (
+                        [self.main_score]
+                        if "score_fields" not in reduction_params
+                        else list(set(reduction_params["score_fields"]))
+                    )
                     # agg_func is indexed positionally: element 0 names the
                     # aggregation, element 2 is the resample-as-group flag
+                    aggregation_function_name = str(reduction_params["agg_func"][0])
+                    field_name_full_prefix = "group_" + aggregation_function_name + "_"
+                    do_resample_as_group = reduction_params["agg_func"][2]
+                    if do_resample_as_group:
+                        # append fixed_ to name because resamples the groups as fixed units
+                        field_name_full_prefix = "fixed_" + field_name_full_prefix
+                    (
+                        scores_to_resample,
+                        aggregation_function,
+                    ) = self._set_up_group_mean_aggregation(
+                        instance_scores,
+                        reduction_params,
+                        reduction_fields,
+                    )
+                else:
+                    raise ValueError(
+                        f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}."
+                    )
+                # calculate global scores for each reduction field
+                for field_name in reduction_fields:
+                    field_name_full = (
+                        field_name_full_prefix + self.score_prefix + field_name
+                    )
+                    # if group resampling (3rd element of agg_func parameter) is True, then
+                    #   1. scores_to_resample are the group scores, and
+                    #   2. aggregation_function is to take the raw mean
+                    # if no group resampling (3rd element of agg_func parameter) is False, then
+                    #   1. scores_to_resample are the original instance scores, and
+                    #   2. aggregation_function is to apply the group aggregation from the instance scores
+                    # either way, the application of aggregation_function to scores_to_resample yields the global score
+                    global_score[field_name_full] = aggregation_function(
+                        scores_to_resample, self.score_prefix + field_name
+                    )
+                    if field_name == self.main_score:
+                        global_score["score"] = global_score[field_name_full]
+                        global_score["score_name"] = field_name_full
+                # need to specify which fields should have CIs calculated for them through ci_scores
+                # (will not automatically calculate CIs for fields in reduction map)
+                if self.ci_scores is not None:
+                    confidence_interval = self.score_based_confidence_interval(
+                        instances=scores_to_resample,
+                        score_names=[
+                            self.score_prefix + ci_score
+                            for ci_score in set(self.ci_scores)
+                        ],
+                        ci_score_prefix=field_name_full_prefix,
+                        aggregation_func=aggregation_function,
+                    )
+                    global_score.update(confidence_interval)
             # pass the finished global score to every scored instance
+            for instance in instance_scores:
+                self.update_and_adjust_global_score(instance, global_score)
             # NOTE(review): stream was already handed to compute_instance_scores
             # above and is iterated again here; this presumably relies on the
             # stream being re-iterable and keeping the same order - confirm
+            for i, instance in enumerate(stream):
+                instance["score"] = recursive_copy(instance_scores[i]["score"])
+                yield instance
     def compute_instance_scores(
         self, stream: Stream, stream_name: Optional[str] = None
 }
+class SQLExecutionLogicAccuracy(InstanceMetric):
     # Instance metric built on get_sql_execution_results: compute() extracts SQL
     # from the prediction, builds a revised query with replace_select_clause
     # (given the gold SQL, the predicted SQL and the dialect) and executes it
     # against the gold SQL on the connector created for task_data["db"].
     # NOTE(review): presumably the select-clause replacement makes the comparison
     # depend only on the query logic - confirm in text2sql_utils.
     # sql_timeout is forwarded unchanged to get_sql_execution_results.
+    sql_timeout: float = 60.0
+    prediction_type = "Any"
+    _requirements_list = ["sqlglot", "func_timeout"]
     main_score = "non_empty_execution_accuracy"
     # names of the SQLExecutionResult dataclass fields annotated as int or float
     # NOTE(review): f.type is a string when annotations are postponed; then
     # isinstance(f.type, type) is False and no field is selected - confirm that
     # SQLExecutionResult is declared with real type objects
+    all_metrics = [
+        f.name
+        for f in dataclasses_fields(SQLExecutionResult)
+        if isinstance(f.type, type) and f.type in (int, float)
+    ]
     # every numeric result field is reduced with the "mean" reduction
+    reduction_map = {"mean": all_metrics}
     # score names listed for confidence intervals (presumably consumed by the
     # InstanceMetric confidence-interval computation - confirm)
     ci_scores = [
         "execution_accuracy",
         "non_empty_execution_accuracy",
+        "subset_non_empty_execution_accuracy",
+        "execution_accuracy_bird",
         "gold_sql_runtime",
         "predicted_sql_runtime",
     ]
+    def compute(self, references: List[Any], prediction: str, task_data: Dict) -> dict:
         # Returns the SQLExecutionResult of one prediction as a dict, extended
         # with "score" and "score_name". Only references[0] is used as gold SQL;
         # task_data must provide task_data["db"]["db_type"].
         # the helpers are imported at call time, inside the method
+        from .text2sql_utils import (
+            ALL_DIALECTS,
+            extract_sql_from_text,
+            get_db_connector,
+            get_sql_execution_results,
+            replace_select_clause,
         )
+        predicted_sql = extract_sql_from_text(prediction)
+        gold_sql = references[0]
+        dialect = task_data["db"]["db_type"]
         # db types that are not listed in ALL_DIALECTS are passed on as None
+        if dialect not in ALL_DIALECTS:
+            dialect = None
         # an empty query is used when either the gold or the predicted SQL is empty
+        revised_sql = (
+            replace_select_clause(gold_sql, predicted_sql, dialect)
+            if gold_sql and predicted_sql
+            else ""
+        )
         # the connector is chosen by the raw db_type, independent of the
         # dialect fallback above
+        db_connector = get_db_connector(task_data["db"]["db_type"])(task_data["db"])
+        result_obj = get_sql_execution_results(
+            revised_sql, gold_sql, db_connector, self.sql_timeout
+        )
+        result = asdict(result_obj)
         # expose the main score under the generic "score" / "score_name" keys
+        result["score"] = result[self.main_score]
+        result["score_name"] = self.main_score
+        logger.debug(f"SQL Execution Accuracy Result: {result}")
+        return result
+class SQLExecutionAccuracy(InstanceMetric):
+    sql_timeout: float = 60.0
+    prediction_type = "Any"
+    _requirements_list = ["sqlglot", "func_timeout"]
+    main_score = "non_empty_execution_accuracy"
+    all_metrics = [
+        f.name
+        for f in dataclasses_fields(SQLExecutionResult)
+        if isinstance(f.type, type) and f.type in (int, float)
+    ]
+    reduction_map = {"mean": all_metrics}
+    ci_scores = [
+        "execution_accuracy",
+        "non_empty_execution_accuracy",
+        "subset_non_empty_execution_accuracy",
+        "execution_accuracy_bird",
+        "gold_sql_runtime",
+        "predicted_sql_runtime",
+    ]
+    def compute(self, references: List[Any], prediction: str, task_data: Dict) -> dict:
+        from .text2sql_utils import (
+            extract_sql_from_text,
+            get_db_connector,
+            get_sql_execution_results,
         )
+        predicted_sql = extract_sql_from_text(prediction)
+        gold_sql = references[0]
+        db_connector = get_db_connector(task_data["db"]["db_type"])(task_data["db"])
+        result_obj = get_sql_execution_results(
+            predicted_sql, gold_sql, db_connector, self.sql_timeout
         )
+        result = asdict(result_obj)
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         logger.debug(f"SQL Execution Accuracy Result: {result}")
 class SQLNonExecutionAccuracy(InstanceMetric):
+    all_metrics = [
+        f.name
+        for f in dataclasses_fields(SQLNonExecutionMetricResult)
+        if isinstance(f.type, type) and f.type in (int, float)
     ]
+    reduction_map = {"mean": all_metrics}
+    main_score = "sqlglot_equivalence"
+    ci_scores = all_metrics
     prediction_type = "Any"  # string representation is compared
     _requirements_list = ["sqlglot", "sqlparse"]
     def compute(self, references: List[Any], prediction: str, task_data: Dict) -> dict:
+        from .text2sql_utils import (
+            extract_sql_from_text,
             is_sqlglot_parsable,
             is_sqlparse_parsable,
             sql_exact_match,
             sqlparse_queries_equivalent,
         )
         gold_sql = references[0]
+        predicted_sql = extract_sql_from_text(prediction)
         is_sqlglot_parsable = is_sqlglot_parsable(predicted_sql)
         is_sqlparse_parsable = is_sqlparse_parsable(predicted_sql)
+        result_obj = SQLNonExecutionMetricResult(
+            sqlglot_validity=int(is_sqlglot_parsable),
+            sqlparse_validity=int(is_sqlparse_parsable),
+            sqlglot_equivalence=int(
                 sqlglot_parsed_queries_equivalent(predicted_sql, gold_sql)
                 if is_sqlglot_parsable
                 else 0
             ),
+            sqlglot_optimized_equivalence=int(
                 sqlglot_optimized_equivalence(predicted_sql, gold_sql)
                 if is_sqlglot_parsable
                 else 0
             ),
+            sqlparse_equivalence=int(
                 sqlparse_queries_equivalent(predicted_sql, gold_sql)
                 if is_sqlparse_parsable
                 else 0
             ),
+            sql_exact_match=int(sql_exact_match(predicted_sql, gold_sql)),
+            sql_syntactic_equivalence=0,  # will update below
+        )
+        result_obj.sql_syntactic_equivalence = int(
             any(
+                [
+                    result_obj.sqlglot_equivalence,
+                    result_obj.sqlglot_optimized_equivalence,
+                    result_obj.sqlparse_equivalence,
+                    result_obj.sql_exact_match,
                 ]
             )
         )
+        result = asdict(result_obj)
         logger.debug(f"SQL Non Execution Accuracy Result: {result}")
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score

operator.py CHANGED Viewed

@@ -6,6 +6,7 @@ from pkg_resources import DistributionNotFound, VersionConflict, require
 from .artifact import Artifact
 from .dataclass import FinalField, InternalField, NonPositionalField
 from .settings_utils import get_constants
 from .stream import DynamicStream, EmptyStreamError, MultiStream, Stream
@@ -346,7 +347,8 @@ class StreamOperator(MultiStreamOperator):
     def _process_stream(
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Generator:
-        yield from self.process(stream, stream_name)
     @abstractmethod
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
@@ -384,12 +386,28 @@ class PagedStreamOperator(StreamOperator):
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Generator:
         page = []
         for instance in stream:
             page.append(instance)
             if len(page) >= self.page_size:
-                yield from self.process(page, stream_name)
                 page = []
-        yield from self._process_page(page, stream_name)
     def _process_page(
         self, page: List[Dict], stream_name: Optional[str] = None
@@ -442,17 +460,9 @@ class InstanceOperator(StreamOperator):
     def _process_stream(
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Generator:
-        try:
-            _index = None
-            for _index, instance in enumerate(stream):
                 yield self._process_instance(instance, stream_name)
-        except Exception as e:
-            if _index is None:
-                raise e
-            else:
-                raise ValueError(
-                    f"Error processing instance '{_index}' from stream '{stream_name}' in {self.__class__.__name__} due to the exception above."
-                ) from e
     def _process_instance(
         self, instance: Dict[str, Any], stream_name: Optional[str] = None

 from .artifact import Artifact
 from .dataclass import FinalField, InternalField, NonPositionalField
+from .error_utils import error_context
 from .settings_utils import get_constants
 from .stream import DynamicStream, EmptyStreamError, MultiStream, Stream
     def _process_stream(
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Generator:
         # Delegates to self.process and re-yields everything it produces. The
         # delegation runs inside error_context, which receives this operator and
         # the stream name (presumably to attach them to raised errors - confirm
         # in error_utils).
+        with error_context(self, stream=stream_name):
+            yield from self.process(stream, stream_name)
     @abstractmethod
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Generator:
         page = []
+        page_number = 0
         for instance in stream:
             page.append(instance)
             if len(page) >= self.page_size:
+                with error_context(
+                    self,
+                    stream=stream_name,
+                    page=page_number,
+                    page_size=len(page),
+                ):
+                    yield from self.process(page, stream_name)
                 page = []
+                page_number += 1
+        if page:  # Handle any remaining instances in the last partial page
+            with error_context(
+                self,
+                stream=stream_name,
+                page=page_number,
+                page_size=len(page),
+                final_page=True,
+            ):
+                yield from self._process_page(page, stream_name)
     def _process_page(
         self, page: List[Dict], stream_name: Optional[str] = None
     def _process_stream(
         self, stream: Stream, stream_name: Optional[str] = None
     ) -> Generator:
         # Yields self._process_instance(instance, stream_name) for every instance
         # of the stream, in order. Each instance gets its own error_context that
         # carries the stream name and the instance's index.
         # The yield sits inside the with-block, so the context stays entered while
         # the generator is suspended; fetching the next instance from `stream`
         # happens in the for statement, outside any error_context.
+        for _index, instance in enumerate(stream):
+            with error_context(self, stream=stream_name, instance=_index):
                 yield self._process_instance(instance, stream_name)
     def _process_instance(
         self, instance: Dict[str, Any], stream_name: Optional[str] = None

operators.py CHANGED Viewed

@@ -67,7 +67,7 @@ from .artifact import Artifact, fetch_artifact
 from .dataclass import NonPositionalField, OptionalField
 from .deprecation_utils import deprecation
 from .dict_utils import dict_delete, dict_get, dict_set, is_subpath
-from .error_utils import UnitxtError
 from .generator_utils import ReusableGenerator
 from .operator import (
     InstanceOperator,
@@ -309,7 +309,9 @@ def recursive_key_value_replace(data, target_key, value_map, value_remove=None):
                         if not isinstance(item, dict) and item not in value_remove
                     ]
                 elif isinstance(value, dict):
-                    pass  # Skip or handle dict values if needed
                 elif value in value_remove:
                     keys_to_delete.append(key)
                 elif value in value_map:
@@ -436,6 +438,7 @@ class InstanceFieldOperator(InstanceOperator):
     field_to_field: Optional[Union[List[List[str]], Dict[str, str]]] = None
     use_query: Optional[bool] = None
     process_every_value: bool = False
     get_default: Any = None
     not_exist_ok: bool = False
     not_exist_do_nothing: bool = False
@@ -521,7 +524,7 @@ class InstanceFieldOperator(InstanceOperator):
     ) -> Dict[str, Any]:
         self.verify_field_definition()
         for from_field, to_field in self._field_to_field:
-            try:
                 old_value = dict_get(
                     instance,
                     from_field,
@@ -532,11 +535,8 @@ class InstanceFieldOperator(InstanceOperator):
                     if self.not_exist_do_nothing:
                         continue
                     old_value = self.get_default
-            except Exception as e:
-                raise ValueError(
-                    f"Failed to get '{from_field}' from instance due to the exception above."
-                ) from e
-            try:
                 if self.process_every_value:
                     new_value = [
                         self.process_instance_value(value, instance)
@@ -544,15 +544,13 @@ class InstanceFieldOperator(InstanceOperator):
                     ]
                 else:
                     new_value = self.process_instance_value(old_value, instance)
-            except Exception as e:
-                raise ValueError(
-                    f"Failed to process field '{from_field}' from instance due to the exception above."
-                ) from e
             dict_set(
                 instance,
                 to_field,
                 new_value,
                 not_exist_ok=True,
             )
         return instance
@@ -610,11 +608,29 @@ class Rename(FieldOperator):
         return res
 @deprecation(version="2.0.0", alternative=Rename)
 class RenameFields(Rename):
     pass
 class AddConstant(FieldOperator):
     """Adds a constant, being argument 'add', to the processed value.
@@ -1200,9 +1216,10 @@ class ApplyOperatorsField(InstanceOperator):
     ) -> Dict[str, Any]:
         operator_names = instance.get(self.operators_field)
         if operator_names is None:
-            assert (
-                self.default_operators is not None
-            ), f"No operators found in field '{self.operators_field}', and no default operators provided."
             operator_names = self.default_operators
         if isinstance(operator_names, str):
@@ -1436,7 +1453,7 @@ class ExecuteExpression(InstanceOperator, ComputeExpressionMixin):
     def process(
         self, instance: Dict[str, Any], stream_name: Optional[str] = None
     ) -> Dict[str, Any]:
-        instance[self.to_field] = self.compute_expression(instance)
         return instance
@@ -1821,54 +1838,58 @@ class ApplyMetric(StreamOperator, ArtifactFetcherMixin):
         # to be populated only when two or more metrics
         accumulated_scores = []
-        first_instance = stream.peek()
-        metric_names = first_instance.get(self.metric_field, [])
-        if not metric_names:
-            raise RuntimeError(
-                f"Missing metric names in field '{self.metric_field}' and instance '{first_instance}'."
-            )
-        if isinstance(metric_names, str):
-            metric_names = [metric_names]
-        metrics_list = []
-        for metric_name in metric_names:
-            metric = self.get_artifact(metric_name)
-            if isinstance(metric, MetricsList):
-                metrics_list.extend(list(metric.items))
-            elif isinstance(metric, Metric):
-                metrics_list.append(metric)
-            else:
-                raise ValueError(
-                    f"Operator {metric_name} must be a Metric or MetricsList"
                 )
-        for metric in metrics_list:
-            metric.set_confidence_interval_calculation(self.calc_confidence_intervals)
-        # Each metric operator computes its score and then sets the main score, overwriting
-        # the previous main score value (if any). So, we need to reverse the order of the listed metrics.
-        # This will cause the first listed metric to run last, and the main score will be set
-        # by the first listed metric (as desired).
-        metrics_list = list(reversed(metrics_list))
-        for i, metric in enumerate(metrics_list):
-            if i == 0:  # first metric
-                multi_stream = MultiStream({"tmp": stream})
-            else:  # metrics with previous scores
-                reusable_generator = ReusableGenerator(
-                    generator=update_scores_of_stream_instances,
-                    gen_kwargs={"stream": stream, "scores": accumulated_scores},
                 )
-                multi_stream = MultiStream.from_generators({"tmp": reusable_generator})
-            multi_stream = metric(multi_stream)
-            if i < len(metrics_list) - 1:  # last metric
-                accumulated_scores = []
-                for inst in multi_stream["tmp"]:
-                    accumulated_scores.append(recursive_copy(inst["score"]))
         yield from multi_stream["tmp"]

 from .dataclass import NonPositionalField, OptionalField
 from .deprecation_utils import deprecation
 from .dict_utils import dict_delete, dict_get, dict_set, is_subpath
+from .error_utils import UnitxtError, error_context
 from .generator_utils import ReusableGenerator
 from .operator import (
     InstanceOperator,
                         if not isinstance(item, dict) and item not in value_remove
                     ]
                 elif isinstance(value, dict):
+                    recursive_key_value_replace(
+                        value, target_key, value_map, value_remove
+                    )
                 elif value in value_remove:
                     keys_to_delete.append(key)
                 elif value in value_map:
     field_to_field: Optional[Union[List[List[str]], Dict[str, str]]] = None
     use_query: Optional[bool] = None
     process_every_value: bool = False
+    set_every_value: bool = NonPositionalField(default=False)
     get_default: Any = None
     not_exist_ok: bool = False
     not_exist_do_nothing: bool = False
     ) -> Dict[str, Any]:
         self.verify_field_definition()
         for from_field, to_field in self._field_to_field:
+            with error_context(self, field=from_field, action="Read Field"):
                 old_value = dict_get(
                     instance,
                     from_field,
                     if self.not_exist_do_nothing:
                         continue
                     old_value = self.get_default
+            with error_context(self, field=from_field, action="Process Field"):
                 if self.process_every_value:
                     new_value = [
                         self.process_instance_value(value, instance)
                     ]
                 else:
                     new_value = self.process_instance_value(old_value, instance)
             dict_set(
                 instance,
                 to_field,
                 new_value,
                 not_exist_ok=True,
+                set_multiple=self.set_every_value,
             )
         return instance
         return res
+class Move(InstanceOperator):
     # Moves a value inside an instance from `field` to `to_field`.
     # field: location the value is read from and then deleted
     # to_field: location the value is written to
     # NOTE(review): both are passed to the dict_utils helpers, so they are
     # presumably allowed to be nested paths - confirm in dict_utils.
+    field: str
+    to_field: str
+    def process(
+        self, instance: Dict[str, Any], stream_name: Optional[str] = None
+    ) -> Dict[str, Any]:
         # read first, delete the source, then write the target; the same
         # instance object is modified and returned
+        value = dict_get(instance, self.field)
+        dict_delete(instance, self.field)
+        dict_set(instance, self.to_field, value=value)
+        return instance
 @deprecation(version="2.0.0", alternative=Rename)
 class RenameFields(Rename):
     # Adds nothing to Rename; kept as a separate name and marked through the
     # deprecation decorator, which names Rename as the alternative.
     pass
+class BytesToString(FieldOperator):
     # Field operator whose processed value is str(value).
     # NOTE(review): str() applied to a bytes object returns its repr-style text
     # (for example "b'abc'"), not the decoded characters; if decoded text is
     # intended, value.decode(...) would be needed - confirm against the callers.
+    def process_value(self, value: Any) -> Any:
+        return str(value)
 class AddConstant(FieldOperator):
     """Adds a constant, being argument 'add', to the processed value.
     ) -> Dict[str, Any]:
         operator_names = instance.get(self.operators_field)
         if operator_names is None:
+            if self.default_operators is None:
+                raise ValueError(
+                    f"No operators found in field '{self.operators_field}', and no default operators provided."
+                )
             operator_names = self.default_operators
         if isinstance(operator_names, str):
     def process(
         self, instance: Dict[str, Any], stream_name: Optional[str] = None
     ) -> Dict[str, Any]:
         # Evaluates self.compute_expression on the instance and stores the result
         # at self.to_field through dict_set (presumably so that to_field may be a
         # nested path - confirm in dict_utils). Returns the same instance object.
+        dict_set(instance, self.to_field, self.compute_expression(instance))
         return instance
         # to be populated only when two or more metrics
         accumulated_scores = []
+        with error_context(self, stage="Load Metrics"):
+            first_instance = stream.peek()
+            metric_names = first_instance.get(self.metric_field, [])
+            if not metric_names:
+                raise RuntimeError(
+                    f"Missing metric names in field '{self.metric_field}' and instance '{first_instance}'."
                 )
+            if isinstance(metric_names, str):
+                metric_names = [metric_names]
+            metrics_list = []
+            for metric_name in metric_names:
+                metric = self.get_artifact(metric_name)
+                if isinstance(metric, MetricsList):
+                    metrics_list.extend(list(metric.items))
+                elif isinstance(metric, Metric):
+                    metrics_list.append(metric)
+                else:
+                    raise ValueError(
+                        f"Operator {metric_name} must be a Metric or MetricsList"
+                    )
+        with error_context(self, stage="Setup Metrics"):
+            for metric in metrics_list:
+                metric.set_confidence_interval_calculation(
+                    self.calc_confidence_intervals
                 )
+            # Each metric operator computes its score and then sets the main score, overwriting
+            # the previous main score value (if any). So, we need to reverse the order of the listed metrics.
+            # This will cause the first listed metric to run last, and the main score will be set
+            # by the first listed metric (as desired).
+            metrics_list = list(reversed(metrics_list))
+            for i, metric in enumerate(metrics_list):
+                if i == 0:  # first metric
+                    multi_stream = MultiStream({"tmp": stream})
+                else:  # metrics with previous scores
+                    reusable_generator = ReusableGenerator(
+                        generator=update_scores_of_stream_instances,
+                        gen_kwargs={"stream": stream, "scores": accumulated_scores},
+                    )
+                    multi_stream = MultiStream.from_generators(
+                        {"tmp": reusable_generator}
+                    )
+                multi_stream = metric(multi_stream)
+                if i < len(metrics_list) - 1:  # last metric
+                    accumulated_scores = []
+                    for inst in multi_stream["tmp"]:
+                        accumulated_scores.append(recursive_copy(inst["score"]))
         yield from multi_stream["tmp"]

processors.py CHANGED Viewed

@@ -98,6 +98,16 @@ class ExtractWithRegex(RegexParser):
         return ""
 class ListToEmptyEntitiesTuples(FieldOperator):
     def process_value(self, lst: Any) -> Any:
         try:
@@ -286,7 +296,7 @@ class StringOrNotString(StringEquals):
 class ExtractMtBenchRatingJudgment(FieldOperator):
     def process_value(self, text: Any) -> Any:
-        match = re.search(r"\[\[([\d]+\.?[\d]*)\]\]", text)
         try:
             return float(match.group(1)) / 10
         except:

         return ""
+class GroupDictWithRegex(FieldOperator):
     # Replaces a string value with the named groups captured by `pattern`.
     # pattern: regular expression; only its named groups, (?P<name>...), end up
     # in the result, because the result comes from Match.groupdict().
+    pattern: str
+    def process_value(self, value: Any) -> Any:
         # re.match only matches at the beginning of the value
+        match = re.match(self.pattern, value)
+        if match:
+            return match.groupdict()
         # no match: an empty dict is returned instead of raising
+        return {}
 class ListToEmptyEntitiesTuples(FieldOperator):
     def process_value(self, lst: Any) -> Any:
         try:
 class ExtractMtBenchRatingJudgment(FieldOperator):
     def process_value(self, text: Any) -> Any:
+        match = re.search(r"\[\[([\s*\d]+\.?[\d]*\s*)(/\s*10)?\s*\]\]", text)
         try:
             return float(match.group(1)) / 10
         except:

schema.py CHANGED Viewed

@@ -59,7 +59,7 @@ def get_schema(stream_name):
 def load_chat_source(chat_str):
     chat = json.loads(chat_str)
     for turn in chat:
-        if isinstance(turn["content"], list):
             for content in turn["content"]:
                 if content["type"] == "image_url":
                     content["image_url"]["url"] = ImageDataString(

 def load_chat_source(chat_str):
     chat = json.loads(chat_str)
     for turn in chat:
+        if "content" in turn and isinstance(turn["content"], list):
             for content in turn["content"]:
                 if content["type"] == "image_url":
                     content["image_url"]["url"] = ImageDataString(

serializers.py CHANGED Viewed

@@ -9,6 +9,7 @@ from .operators import InstanceFieldOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype, to_type_string
 from .types import (
     Dialog,
     Document,
     Image,
@@ -75,7 +76,22 @@ class DialogSerializer(SingleTypeSerializer):
     def serialize(self, value: Dialog, instance: Dict[str, Any]) -> str:
         # Convert the Dialog into a string representation, typically combining roles and content
-        return "\n".join(f"{turn['role']}: {turn['content']}" for turn in value)
 class NumberSerializer(SingleTypeSerializer):
@@ -225,7 +241,7 @@ class SQLDatabaseAsSchemaSerializer(SingleTypeSerializer):
     serialized_type = SQLDatabase
     def serialize(self, value: SQLDatabase, instance: Dict[str, Any]) -> str:
-        from .sql_utils import get_db_connector
         connector = get_db_connector(value["db_type"])(value)
         return connector.get_table_schema()

 from .settings_utils import get_constants
 from .type_utils import isoftype, to_type_string
 from .types import (
+    Conversation,
     Dialog,
     Document,
     Image,
     def serialize(self, value: Dialog, instance: Dict[str, Any]) -> str:
         # Convert the Dialog into a string representation, typically combining roles and content
+        turns = []
+        for turn in value:
+            turn_str = f"{turn['role']}: "
+            if "content" in turn:
+                turn_str += str(turn["content"])
+            if "tool_calls" in turn:
+                turn_str += "\n" + json.dumps(turn["tool_calls"])
+            turns.append(turn_str)
+        return "\n".join(turns)
+class ConversationSerializer(DialogSerializer):
+    serialized_type = Conversation
+    def serialize(self, value: Conversation, instance: Dict[str, Any]) -> str:
+        return super().serialize(value["dialog"], instance)
 class NumberSerializer(SingleTypeSerializer):
     serialized_type = SQLDatabase
     def serialize(self, value: SQLDatabase, instance: Dict[str, Any]) -> str:
+        from .text2sql_utils import get_db_connector
         connector = get_db_connector(value["db_type"])(value)
         return connector.get_table_schema()

settings_utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import importlib.metadata
 import importlib.util
 import os
 from contextlib import contextmanager
 from .version import version
@@ -177,6 +178,9 @@ if Constants.is_uninitilized():
     constants.dataset_url = "unitxt/data"
     constants.metric_url = "unitxt/metric"
     constants.version = version
     constants.catalog_hierarchy_sep = "."
     constants.env_local_catalogs_paths_sep = ":"
     constants.non_registered_files = [

 import importlib.metadata
 import importlib.util
 import os
+import sys
 from contextlib import contextmanager
 from .version import version
     constants.dataset_url = "unitxt/data"
     constants.metric_url = "unitxt/metric"
     constants.version = version
+    constants.python = (
+        f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
+    )
     constants.catalog_hierarchy_sep = "."
     constants.env_local_catalogs_paths_sep = ":"
     constants.non_registered_files = [

struct_data_operators.py CHANGED Viewed

@@ -23,6 +23,7 @@ For key-value pairs, expected input format is:
     {"key1": "value1", "key2": value2, "key3": "value3"}
 """
 import json
 import random
 from abc import ABC, abstractmethod
@@ -754,11 +755,40 @@ class LoadJson(FieldOperator):
             return json.loads(value, strict=False)
 class ToolCallPostProcessor(FieldOperator):
     failure_value: Any = None
     allow_failure: bool = False
     def process_value(self, value: str) -> ToolCall:
         if self.allow_failure:
             try:
                 result = json.loads(value)
@@ -776,6 +806,25 @@ class ToolCallPostProcessor(FieldOperator):
         return result
 class DumpJson(FieldOperator):
     def process_value(self, value: str) -> str:
         return json.dumps(value)

     {"key1": "value1", "key2": value2, "key3": "value3"}
 """
+import ast
 import json
 import random
 from abc import ABC, abstractmethod
             return json.loads(value, strict=False)
+class PythonCallProcessor(FieldOperator):
+    def process_value(self, value: str) -> ToolCall:
+        expr = ast.parse(value, mode="eval").body
+        function = expr.func.id
+        args = {}
+        for kw in expr.keywords:
+            args[kw.arg] = ast.literal_eval(kw.value)
+        # Handle positional args, if any
+        if expr.args:
+            args["_args"] = [ast.literal_eval(arg) for arg in expr.args]
+        return {"name": function, "arguments": args}
+def extract_possible_json_str(text):
+    """Extract potential JSON string from text by finding outermost braces/brackets."""
+    # Find first opening delimiter
+    start_positions = [pos for pos in [text.find("{"), text.find("[")] if pos != -1]
+    start = min(start_positions) if start_positions else 0
+    # Find last closing delimiter
+    end_positions = [pos for pos in [text.rfind("}"), text.rfind("]")] if pos != -1]
+    end = max(end_positions) if end_positions else len(text) - 1
+    return text[start : end + 1]
 class ToolCallPostProcessor(FieldOperator):
     failure_value: Any = None
     allow_failure: bool = False
     def process_value(self, value: str) -> ToolCall:
+        value = extract_possible_json_str(
+            value
+        )  # clear tokens such as <tool_call> focusing on the call json itself
         if self.allow_failure:
             try:
                 result = json.loads(value)
         return result
+class MultipleToolCallPostProcessor(FieldOperator):
+    failure_value: Any = None
+    allow_failure: bool = False
+    def process_value(self, value: str) -> List[ToolCall]:
+        if self.allow_failure:
+            try:
+                result = json.loads(value)
+            except json.JSONDecodeError:
+                return self.failure_value
+        else:
+            result = json.loads(value, strict=False)
+        if isoftype(result, List[ToolCall]):
+            return result
+        if not isoftype(result, ToolCall):
+            return self.failure_value
+        return [result]
 class DumpJson(FieldOperator):
     def process_value(self, value: str) -> str:
         return json.dumps(value)

task.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Union
 from .artifact import fetch_artifact
 from .deprecation_utils import deprecation
-from .error_utils import Documentation, UnitxtError, UnitxtWarning
 from .logging_utils import get_logger
 from .metrics import MetricsList
 from .operator import InstanceOperator
@@ -285,13 +285,18 @@ class Task(InstanceOperator, ArtifactFetcherMixin):
     ) -> Dict[str, Any]:
         instance = self.set_default_values(instance)
-        verify_required_schema(
-            self.input_fields,
-            instance,
-            class_name="Task",
-            id=self.__id__,
-            description=self.__description__,
-        )
         input_fields = {key: instance[key] for key in self.input_fields.keys()}
         data_classification_policy = instance.get("data_classification_policy", [])

 from .artifact import fetch_artifact
 from .deprecation_utils import deprecation
+from .error_utils import Documentation, UnitxtError, UnitxtWarning, error_context
 from .logging_utils import get_logger
 from .metrics import MetricsList
 from .operator import InstanceOperator
     ) -> Dict[str, Any]:
         instance = self.set_default_values(instance)
+        with error_context(
+            self,
+            stage="Schema Verification",
+            help="https://www.unitxt.ai/en/latest/docs/adding_task.html",
+        ):
+            verify_required_schema(
+                self.input_fields,
+                instance,
+                class_name="Task",
+                id=self.__id__,
+                description=self.__description__,
+            )
         input_fields = {key: instance[key] for key in self.input_fields.keys()}
         data_classification_policy = instance.get("data_classification_policy", [])

templates.py CHANGED Viewed

@@ -6,11 +6,12 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from .artifact import Artifact
 from .collections import DictCollection, ListCollection
 from .dataclass import NonPositionalField
-from .dict_utils import dict_set
 from .error_utils import Documentation, UnitxtError
 from .operator import InstanceOperator, Operator
 from .random_utils import new_random_generator
 from .serializers import (
     DialogSerializer,
     ImageSerializer,
     ListSerializer,
@@ -68,6 +69,7 @@ class Template(InstanceOperator):
                 ToolCallSerializer(),
                 ToolsSerializer(),
                 DialogSerializer(),
                 ListSerializer(),
                 SQLDatabaseAsSchemaSerializer(),
             ]
@@ -942,6 +944,16 @@ class MultiReferenceTemplate(InputOutputTemplate):
         return target, references
 def escape_chars(s, chars_to_escape):
     for char in chars_to_escape:
         s = s.replace(char, f"\\{char}")

 from .artifact import Artifact
 from .collections import DictCollection, ListCollection
 from .dataclass import NonPositionalField
+from .dict_utils import dict_get, dict_set
 from .error_utils import Documentation, UnitxtError
 from .operator import InstanceOperator, Operator
 from .random_utils import new_random_generator
 from .serializers import (
+    ConversationSerializer,
     DialogSerializer,
     ImageSerializer,
     ListSerializer,
                 ToolCallSerializer(),
                 ToolsSerializer(),
                 DialogSerializer(),
+                ConversationSerializer(),
                 ListSerializer(),
                 SQLDatabaseAsSchemaSerializer(),
             ]
         return target, references
+class MultiTurnTemplate(MultiReferenceTemplate):
+    input_format = ""
+    turns_field: str
+    def post_process_instance(self, instance):
+        turns = dict_get(instance["input_fields"], self.turns_field)
+        instance["__turns__"] = turns
+        return super().post_process_instance(instance)
 def escape_chars(s, chars_to_escape):
     for char in chars_to_escape:
         s = s.replace(char, f"\\{char}")

sql_utils.py → text2sql_utils.py RENAMED Viewed

@@ -7,9 +7,13 @@ import re
 import sqlite3
 import time
 from abc import ABC, abstractmethod
 from functools import lru_cache
-from typing import Any, List, Optional
 import requests
 from huggingface_hub import snapshot_download
 from requests.exceptions import ConnectionError, ReadTimeout
@@ -539,6 +543,17 @@ def get_db_connector(db_type: str):
     return connector
 def is_sqlglot_parsable(sql: str, db_type="sqlite") -> bool:
     """Returns True if sqlglot does not encounter any error, False otherwise."""
     from sqlglot import parse
@@ -695,7 +710,7 @@ def extract_select_info(sql: str):
 def sqlparse_queries_equivalent(sql1: str, sql2: str) -> bool:
-    """Return True if both SQL queries are naively considered equivalent."""
     try:
         info1 = extract_select_info(sql1)
         info2 = extract_select_info(sql2)
@@ -713,6 +728,7 @@ def sqlparse_queries_equivalent(sql1: str, sql2: str) -> bool:
 def sqlglot_parsed_queries_equivalent(sql1: str, sql2: str, dialect: str = "") -> bool:
     from sqlglot import exp, parse_one
     try:
@@ -754,3 +770,473 @@ def sql_exact_match(sql1: str, sql2: str) -> bool:
         return s.upper()
     return normalize_sql(sql1) == normalize_sql(sql2)

 import sqlite3
 import time
 from abc import ABC, abstractmethod
+from collections import Counter
+from dataclasses import dataclass
 from functools import lru_cache
+from typing import Any, List, Optional, Tuple
+import numpy as np
+import pandas as pd
 import requests
 from huggingface_hub import snapshot_download
 from requests.exceptions import ConnectionError, ReadTimeout
     return connector
+@dataclass
+class SQLNonExecutionMetricResult:
+    sqlglot_validity: int  # Whether SQL parses with sqlglot
+    sqlparse_validity: int  # Whether SQL parses with sqlparse
+    sqlglot_equivalence: int  # Semantic equivalence using sqlglot AST
+    sqlglot_optimized_equivalence: int  # Equivalence after optimization via sqlglot
+    sqlparse_equivalence: int  # Equivalence using sqlparse AST
+    sql_exact_match: int  # Exact string match of predicted and gold SQL
+    sql_syntactic_equivalence: int  # Any of the above equivalence conditions hold
 def is_sqlglot_parsable(sql: str, db_type="sqlite") -> bool:
     """Returns True if sqlglot does not encounter any error, False otherwise."""
     from sqlglot import parse
 def sqlparse_queries_equivalent(sql1: str, sql2: str) -> bool:
+    """Returns True if both SQL queries are naively considered equivalent."""
     try:
         info1 = extract_select_info(sql1)
         info2 = extract_select_info(sql2)
 def sqlglot_parsed_queries_equivalent(sql1: str, sql2: str, dialect: str = "") -> bool:
+    """Return True if two SQL queries match after parsing with SQLGlot."""
     from sqlglot import exp, parse_one
     try:
         return s.upper()
     return normalize_sql(sql1) == normalize_sql(sql2)
+@dataclass
+class SQLExecutionResult:
+    execution_accuracy: int  # Whether the predicted and gold SQL results match exactly
+    non_empty_execution_accuracy: (
+        int  # Same as execution_accuracy but only if gold is non-empty
+    )
+    subset_non_empty_execution_accuracy: (
+        int  # Whether predicted is a subset of gold or vice versa, non-empty only
+    )
+    execution_accuracy_bird: (
+        int  # Whether the predicted SQL matches gold using BIRD evaluation logic
+    )
+    non_empty_gold_df: int  # Whether the gold SQL produced a non-empty dataframe
+    gold_sql_runtime: float  # Time taken to execute the gold SQL
+    predicted_sql_runtime: float  # Time taken to execute the predicted SQL
+    pred_to_gold_runtime_ratio: float  # Ratio of predicted runtime to gold runtime
+    gold_error: int  # Whether the gold SQL had an execution error
+    predicted_error: int  # Whether the predicted SQL had an execution error
+    gold_df_json: str  # JSON representation of the gold SQL result dataframe
+    predicted_df_json: str  # JSON representation of the predicted SQL result dataframe
+    error_message: str  # Error message from predicted execution if any
+def compare_dfs_ignore_colnames_ordered_rows(
+    df1: pd.DataFrame, df2: pd.DataFrame
+) -> bool:
+    if df1.shape != df2.shape:
+        return False
+    df1_sorted_rows = np.array([np.sort(row) for row in df1.values.astype(str)])
+    df2_sorted_rows = np.array([np.sort(row) for row in df2.values.astype(str)])
+    return np.array_equal(df1_sorted_rows, df2_sorted_rows)
+def compare_dfs_ignore_colnames_unordered_rows(
+    df1: pd.DataFrame, df2: pd.DataFrame
+) -> bool:
+    if df1.shape != df2.shape:
+        return False
+    df1_sorted = np.sort(np.sort(df1.values.astype(str), axis=1), axis=0)
+    df2_sorted = np.sort(np.sort(df2.values.astype(str), axis=1), axis=0)
+    return np.array_equal(df1_sorted, df2_sorted)
+def compare_dfs_ignore_colnames_subset(
+    df1: pd.DataFrame, df2: pd.DataFrame, ignore_row_order: bool = True
+) -> bool:
+    """Checks if the smaller of the two DataFrames is likely a subset of the other.
+    Subset comparison is column-based, to support Text2SQL evaluation for when the
+    predicted SQL dataframe has missing or additional columns. Each row is treated as
+    a multiset of (stringified) values, and the function checks if every row in the
+    smaller DataFrame (by column count) is a multiset subset of the corresponding row
+    in the larger DataFrame. When ground truth SQL does not have ORDER BY,
+    ignore_row_order can be set to True to ignore the order of rows. In this case,
+    column values are sorted before comparison. This means that there could be cases
+    where the dataframes have the exact same number of rows and column values after
+    sort are the same, but the dataframes are not actually a subset of each other.
+    This is unlikely to happen in practice, but the score is not guaranteed to be
+    100% accurate and may overestimate the accuracy.
+    Args:
+        df1 (pd.DataFrame): The first DataFrame to compare.
+        df2 (pd.DataFrame): The second DataFrame to compare.
+        ignore_row_order (bool, optional): If True, ignores the order of rows by
+            sorting them before comparison. Defaults to True.
+    Returns:
+        bool: True if the smaller DataFrame (column-wise) is likely a subset of the
+            larger one, False otherwise.
+    """
+    def row_to_multiset(row):
+        return Counter(str(x) for x in row)
+    def rows_to_multisets(df):
+        return [row_to_multiset(row) for row in df.values]
+    def sort_df(df):
+        sorted_df = df.copy()
+        for col in sorted_df.columns:
+            sorted_df[col] = sorted_df[col].astype(str).sort_values(ignore_index=True)
+        return sorted_df
+    if df1.empty or df2.empty or len(df1) != len(df2):
+        return False
+    subset_df, superset_df = (df1, df2) if df1.shape[1] <= df2.shape[1] else (df2, df1)
+    if ignore_row_order:
+        subset_df = sort_df(subset_df)
+        superset_df = sort_df(superset_df)
+    subset_rows = rows_to_multisets(subset_df)
+    superset_rows = rows_to_multisets(superset_df)
+    for r1, r2 in zip(subset_rows, superset_rows):
+        if not all(r1[k] <= r2.get(k, 0) for k in r1):
+            return False
+    return True
+def compare_dfs_bird_eval_logic(df1: pd.DataFrame, df2: pd.DataFrame):
+    """Check if two SQL query result sets are exactly equal, as in BIRD evaluation.
+    This function checks if the set of rows returned by the predicted SQL query
+    (`predicted_res`) is exactly equal to the set of rows returned by the ground truth
+    SQL query (`ground_truth_res`). This is the logic used in the original BIRD
+    evaluation code:
+    https://github.com/AlibabaResearch/DAMO-ConvAI/blob/main/bird/llm/src/evaluation.py.
+    """
+    df1_set = {tuple(row) for row in df1.values.astype(str)}
+    df2_set = {tuple(row) for row in df2.values.astype(str)}
+    return int(df1_set == df2_set)
+def compare_result_dfs(
+    gold_df: pd.DataFrame, pred_df: pd.DataFrame, gold_sql: str
+) -> Tuple[int, int, int]:
+    """Compares two DataFrames representing SQL query results.
+    Args:
+        gold_df (pd.DataFrame): The ground truth DataFrame.
+        pred_df (pd.DataFrame): The predicted DataFrame.
+        gold_sql (str): The ground truth SQL query string.
+    Returns:
+        Tuple[int, int, int]: A tuple containing:
+            - match (int): 1 if the predicted DataFrame matches the gold DataFrame
+            - non_empty_match (int): 1 if both DataFrames are non-empty and match,
+              0 otherwise.
+            - subset_match (int): 1 if the predicted DataFrame is a subset or
+              superset of the gold DataFrame.
+    Notes:
+        - The comparison ignores column names.
+        - Row order is considered only if 'ORDER BY' is present in the SQL query.
+    """
+    subset_match = 0
+    non_empty_match = 0
+    if "ORDER BY" in gold_sql.upper():
+        match = int(compare_dfs_ignore_colnames_ordered_rows(pred_df, gold_df))
+        if not gold_df.empty and not pred_df.empty:
+            non_empty_match = match
+            if compare_dfs_ignore_colnames_subset(
+                gold_df, pred_df, ignore_row_order=False
+            ):
+                subset_match = 1
+    else:
+        match = int(compare_dfs_ignore_colnames_unordered_rows(pred_df, gold_df))
+        if not gold_df.empty and not pred_df.empty:
+            non_empty_match = match
+            if compare_dfs_ignore_colnames_subset(
+                gold_df, pred_df, ignore_row_order=True
+            ):
+                subset_match = 1
+    return match, non_empty_match, subset_match
+def run_query(
+    sql: str, connector, sql_timeout: float
+) -> Tuple[Optional[pd.DataFrame], float, str]:
+    """Executes a SQL query using the provided connector with a timeout.
+    Args:
+        sql (str): The SQL query string to execute.
+        connector: An object with an `execute_query` method that executes the SQL
+            query.
+        sql_timeout (float): The maximum time in seconds to allow for query
+            execution.
+    Returns:
+        Tuple[Optional[pd.DataFrame], float, str]:
+            - A pandas DataFrame containing the query results, or None if an error
+              occurred.
+            - The duration in seconds taken to execute the query. 0.0 if an error.
+            - An error message string if an error occurred, otherwise an empty
+              string.
+    Notes:
+        - If the SQL string is empty or only whitespace, returns immediately with a
+          message.
+        - If the query execution exceeds the timeout, returns a timeout error
+          message.
+        - Any other exceptions are caught and returned as error messages.
+    """
+    import time
+    from func_timeout import func_timeout
+    from func_timeout.exceptions import FunctionTimedOut
+    if not sql.strip():
+        return None, 0.0, "No SQL query found in the prediction."
+    try:
+        start = time.perf_counter()
+        result, error = func_timeout(sql_timeout, connector.execute_query, args=(sql,))
+        duration = time.perf_counter() - start
+        if isinstance(result, dict) and "results" in result:
+            result = result["results"]
+        if error:
+            return None, duration, error
+        return pd.DataFrame(result), duration, ""
+    except FunctionTimedOut as e:
+        return None, 0.0, f"Timeout: {e}"
+    except Exception as e:
+        return None, 0.0, f"Error: {e}"
+def get_sql_execution_results(
+    predicted_sql: str, gold_sql: str, connector, sql_timeout: float
+) -> SQLExecutionResult:
+    """Execute and compare predicted and gold SQL queries, returning execution metrics.
+    Args:
+        predicted_sql (str): The SQL query predicted by the model.
+        gold_sql (str): The reference (gold) SQL query.
+        connector: Database connector object used to execute the queries.
+        sql_timeout (float): Maximum time (in seconds) allowed for query execution.
+    Returns:
+        SQLExecutionResult: An object containing various execution metrics, including:
+            - execution_accuracy (int): 1 if predicted and gold queries produce
+              equivalent results, else 0.
+            - non_empty_execution_accuracy (int): 1 if both queries produce non-empty
+              and equivalent results, else 0.
+            - subset_non_empty_execution_accuracy (int): 1 if predicted results are a
+              subset or superset of gold results and non-empty, else 0. Subset
+              comparison is column-based. This means that the predicted SQL dataframe
+              can have missing or additional columns compared to the gold SQL dataframe.
+            - execution_accuracy_bird (int): 1 if results match according to BIRD
+              evaluation logic, else 0.
+            - non_empty_gold_df (int): 1 if the gold query result is non-empty, else 0.
+            - gold_sql_runtime (float): Execution time for the gold SQL query.
+            - predicted_sql_runtime (float): Execution time for the predicted SQL query.
+            - pred_to_gold_runtime_ratio (float): Ratio of predicted to gold query
+              runtimes.
+            - gold_error (int): 1 if the gold query failed, else 0.
+            - predicted_error (int): 1 if the predicted query failed, else 0.
+            - gold_df_json (str): JSON representation of the gold query result
+              DataFrame.
+            - predicted_df_json (str): JSON representation of the predicted query
+              result DataFrame.
+            - error_message (str): Error message if any query failed, else empty
+              string.
+    Notes:
+        - If the gold query fails, the function returns early with error details.
+        - If the predicted query is identical or SQL-equivalent to the gold query,
+          results are considered correct without execution.
+        - Otherwise, both queries are executed and their results compared using
+          multiple metrics.
+    """
+    gold_df, gold_runtime, gold_error_msg = run_query(gold_sql, connector, sql_timeout)
+    gold_error = int(bool(gold_error_msg))
+    if gold_error:
+        return SQLExecutionResult(
+            execution_accuracy=0,
+            non_empty_execution_accuracy=0,
+            subset_non_empty_execution_accuracy=0,
+            execution_accuracy_bird=0,
+            non_empty_gold_df=0,
+            gold_sql_runtime=gold_runtime,
+            predicted_sql_runtime=0,
+            pred_to_gold_runtime_ratio=0,
+            gold_error=gold_error,
+            predicted_error=0,
+            gold_df_json="",
+            predicted_df_json="",
+            error_message=gold_error_msg,
+        )
+    non_empty_gold_df = int(not gold_df.empty)
+    if predicted_sql.strip().lower() == gold_sql.strip().lower():
+        return SQLExecutionResult(
+            execution_accuracy=1,
+            non_empty_execution_accuracy=non_empty_gold_df,
+            subset_non_empty_execution_accuracy=non_empty_gold_df,
+            execution_accuracy_bird=1,
+            non_empty_gold_df=non_empty_gold_df,
+            gold_sql_runtime=gold_runtime,
+            predicted_sql_runtime=0,
+            pred_to_gold_runtime_ratio=0,
+            gold_error=0,
+            predicted_error=0,
+            gold_df_json=gold_df.to_json(),
+            predicted_df_json=gold_df.to_json(),
+            error_message="",
+        )
+    try:
+        if sqlglot_optimized_equivalence(gold_sql, predicted_sql):
+            return SQLExecutionResult(
+                execution_accuracy=1,
+                non_empty_execution_accuracy=non_empty_gold_df,
+                subset_non_empty_execution_accuracy=non_empty_gold_df,
+                execution_accuracy_bird=1,
+                non_empty_gold_df=non_empty_gold_df,
+                gold_sql_runtime=gold_runtime,
+                predicted_sql_runtime=0,
+                pred_to_gold_runtime_ratio=0,
+                gold_error=0,
+                predicted_error=0,
+                gold_df_json=gold_df.to_json(),
+                predicted_df_json=gold_df.to_json(),
+                error_message="",
+            )
+    except Exception as e:
+        logger.info(f"Could not check SQL equivalence: {e}")
+    pred_df, pred_runtime, pred_error_msg = run_query(
+        predicted_sql, connector, sql_timeout
+    )
+    pred_error = 1 if pred_error_msg else 0
+    if pred_df is None:
+        return SQLExecutionResult(
+            execution_accuracy=0,
+            non_empty_execution_accuracy=0,
+            subset_non_empty_execution_accuracy=0,
+            execution_accuracy_bird=0,
+            non_empty_gold_df=non_empty_gold_df,
+            gold_sql_runtime=gold_runtime,
+            predicted_sql_runtime=pred_runtime,
+            pred_to_gold_runtime_ratio=(pred_runtime / gold_runtime)
+            if gold_runtime > 0
+            else 0,
+            gold_error=0,
+            predicted_error=pred_error,
+            gold_df_json=gold_df.to_json(),
+            predicted_df_json="",
+            error_message=pred_error_msg,
+        )
+    match, non_empty_match, subset_match = compare_result_dfs(
+        gold_df, pred_df, gold_sql
+    )
+    bird_match = compare_dfs_bird_eval_logic(gold_df, pred_df)
+    return SQLExecutionResult(
+        execution_accuracy=match,
+        non_empty_execution_accuracy=non_empty_match,
+        subset_non_empty_execution_accuracy=subset_match,
+        execution_accuracy_bird=bird_match,
+        non_empty_gold_df=non_empty_gold_df,
+        gold_sql_runtime=gold_runtime,
+        predicted_sql_runtime=pred_runtime,
+        pred_to_gold_runtime_ratio=(pred_runtime / gold_runtime)
+        if gold_runtime > 0
+        else 0,
+        gold_error=0,
+        predicted_error=0,
+        gold_df_json=gold_df.to_json(),
+        predicted_df_json=pred_df.to_json(),
+        error_message=pred_error_msg,
+    )
+def replace_select_clause(
+    source_query: str, target_query: str, dialect: str = "postgres"
+) -> str:
+    """Replaces the SELECT clause of the target SQL query with the SELECT clause from the source SQL query.
+    Args:
+        source_query (str): SQL query whose SELECT clause will be used.
+        target_query (str): SQL query whose SELECT clause will be replaced.
+        dialect (str): SQL dialect for parsing and rendering (default: "postgres").
+    Returns:
+        str: A new SQL query with the SELECT clause of `target_query` replaced by that of `source_query`.
+    Raises:
+        ValueError: If either query is not a valid SELECT statement.
+    Example:
+        >>> replace_select_clause(
+        ...     "SELECT id FROM employees",
+        ...     "SELECT name FROM employees WHERE age > 30"
+        ... )
+        "SELECT id FROM employees WHERE age > 30"
+    """
+    from sqlglot import exp, parse_one
+    if not dialect:
+        dialect = "postgres"
+    # Parse queries using the specified dialect
+    source_ast = parse_one(source_query, read=dialect)
+    target_ast = parse_one(target_query, read=dialect)
+    if not isinstance(source_ast, exp.Select) or not isinstance(target_ast, exp.Select):
+        raise ValueError("Both queries must be valid SELECT statements.")
+    # Replace SELECT expressions in the target with those from the source
+    target_ast.set("expressions", source_ast.expressions)
+    # Return the updated SQL string using the dialect
+    return target_ast.sql(dialect=dialect)
+def extract_sql_from_text(text: str) -> str:
+    """Extracts the first SQL query from the given text.
+    Priority:
+    1. SQL inside fenced blocks like ```sql ... ```
+    2. SQL starting on a new line or after a colon/label
+    3. SQL without semicolon
+    Returns:
+        The SQL query string, or an empty string if not found.
+    """
+    # 1. Look for fenced SQL code block
+    fenced_block_pattern = re.compile(r"```sql\s+(.*?)```", re.IGNORECASE | re.DOTALL)
+    match = fenced_block_pattern.search(text)
+    if match:
+        return match.group(1).strip()
+    # 2. Inline SQL with semicolon
+    sql_keywords = r"(?:SELECT|INSERT|UPDATE|DELETE|WITH)\s+"
+    sql_start = (
+        r"(?:^|\n|:\s*)"  # Start of string, newline, or colon label like "Just run:"
+    )
+    sql_pattern = re.compile(
+        rf"{sql_start}({sql_keywords}.*?;)", re.IGNORECASE | re.DOTALL
+    )
+    match = sql_pattern.search(text)
+    if match:
+        return match.group(1).strip()
+    # 3. Inline SQL without semicolon
+    fallback_pattern = re.compile(
+        rf"{sql_start}({sql_keywords}.*)", re.IGNORECASE | re.DOTALL
+    )
+    fallback_match = fallback_pattern.search(text)
+    if fallback_match:
+        return fallback_match.group(1).strip()
+    return ""
+# Display names of SQL dialects, listed alphabetically (case-insensitive).
+# NOTE(review): presumably mirrors the dialects supported by sqlglot - confirm
+# against the sqlglot version in use before relying on it.
+ALL_DIALECTS = [
+    "Athena",
+    "BigQuery",
+    "ClickHouse",
+    "Databricks",
+    "Doris",
+    "Drill",
+    "Druid",
+    "DuckDB",
+    "Hive",
+    "Materialize",
+    "MySQL",
+    "Oracle",
+    "Postgres",
+    "Presto",
+    "PRQL",
+    "Redshift",
+    "RisingWave",
+    "Snowflake",
+    "Spark",
+    "Spark2",
+    "SQLite",
+    "StarRocks",
+    "Tableau",
+    "Teradata",
+    "Trino",
+    "TSQL",
+]

type_utils.py CHANGED Viewed

@@ -503,9 +503,25 @@ def isoftype(object, typing_type):
     if is_typed_dict(typing_type):
         if not isinstance(object, dict):
             return False
         for key, expected_type in typing_type.__annotations__.items():
-            if key not in object or not isoftype(object[key], expected_type):
-                return False
         return True
     if typing_type == typing.Any:

     if is_typed_dict(typing_type):
         if not isinstance(object, dict):
             return False
+        # Only support total=True, check each field
         for key, expected_type in typing_type.__annotations__.items():
+            # Check if field is Optional (Union with None)
+            is_optional = (
+                hasattr(expected_type, "__origin__")
+                and expected_type.__origin__ is Union
+                and type(None) in expected_type.__args__
+            )
+            if key not in object:
+                # Field is missing - only allowed if it's Optional
+                if not is_optional:
+                    return False
+            else:
+                # Field is present - check type
+                if not isoftype(object[key], expected_type):
+                    return False
         return True
     if typing_type == typing.Any:

types.py CHANGED Viewed

@@ -6,8 +6,52 @@ Text = NewType("Text", str)
 Number = NewType("Number", Union[float, int])
-class Turn(TypedDict):
-    role: Literal["system", "user", "agent"]
     content: Text
@@ -18,7 +62,12 @@ class RagResponse(TypedDict):
     is_answerable: bool
-Dialog = NewType("Dialog", List[Turn])
 class Image(TypedDict):
@@ -52,36 +101,17 @@ class SQLDatabase(TypedDict):
     data: Optional[Dict[str, Dict]]
-class JsonSchema:
-    @classmethod
-    def __verify_type__(cls, object):
-        if not isinstance(object, dict):
-            return False
-        import jsonschema_rs
-        jsonschema_rs.meta.validate(object)
-        return True
-class Tool(TypedDict):
-    name: str
-    description: str
-    parameters: JsonSchema
-class ToolCall(TypedDict):
-    name: str
-    arguments: Dict[str, Any]
 register_type(Text)
 register_type(Number)
-register_type(Turn)
 register_type(Dialog)
 register_type(Table)
 register_type(Audio)
 register_type(Image)
 register_type(Video)
 register_type(Document)
 register_type(MultiDocument)
 register_type(RagResponse)

 Number = NewType("Number", Union[float, int])
+class JsonSchema:
+    """Type used to annotate JSON Schema dicts (e.g. ``Tool.parameters``).
+    Membership is decided by ``__verify_type__`` rather than by the class
+    hierarchy. NOTE(review): presumably called by the type-checking utilities
+    - confirm against type_utils.
+    """
+    @classmethod
+    def __verify_type__(cls, object):
+        """Return False for non-dict values, otherwise validate and return True.
+        NOTE(review): an invalid schema presumably makes
+        ``jsonschema_rs.meta.validate`` raise instead of this method returning
+        False - confirm against the jsonschema_rs documentation.
+        """
+        if not isinstance(object, dict):
+            return False
+        # Imported here rather than at module level, so jsonschema_rs is only
+        # loaded once a dict actually reaches this point.
+        import jsonschema_rs
+        jsonschema_rs.meta.validate(object)
+        return True
+class Tool(TypedDict):
+    """Definition of a tool: its name, description and parameter schema."""
+    # Original fields
+    name: str
+    description: str
+    parameters: JsonSchema
+    # LiteLLM extension
+    # Declared Optional: isoftype (type_utils.py, changed in this same commit)
+    # lets a key be absent when its annotation is Optional, so "type" may be
+    # omitted.
+    type: Optional[Literal["function"]]
+class ToolCall(TypedDict):
+    """A single tool invocation: a name plus a dict of keyword arguments."""
+    # NOTE(review): presumably the name of the Tool being invoked - confirm.
+    name: str
+    # Argument values are unconstrained (Any); keys are strings.
+    arguments: Dict[str, Any]
+class ToolCallContext(TypedDict):
+    """Wraps a ToolCall with an id and a type tag that is always "function"."""
+    id: str
+    type: Literal["function"]
+    function: ToolCall
+class ToolCallTurn(TypedDict):
+    """Dialog turn with role "assistant" that carries a list of tool calls."""
+    role: Literal["assistant"]
+    # May be None (Optional); the tool calls live in `tool_calls`.
+    content: Optional[str]
+    tool_calls: List[ToolCallContext]
+class ToolOutputTurn(TypedDict):
+    """Dialog turn with role "tool"."""
+    role: Literal["tool"]
+    # NOTE(review): presumably equals the ToolCallContext.id of the call this
+    # turn answers - confirm against the producers of these turns.
+    tool_call_id: str
+    name: str
+    content: str
+class TextTurn(TypedDict):
+    role: Literal["system", "user", "agent", "assistant"]
     content: Text
     is_answerable: bool
+# A dialog is a list whose items may each be a text turn, a tool-call turn
+# or a tool-output turn.
+Dialog = NewType("Dialog", List[Union[TextTurn, ToolCallTurn, ToolOutputTurn]])
+class Conversation(TypedDict):
+    """A Dialog paired with a string id."""
+    id: str
+    dialog: Dialog
 class Image(TypedDict):
     data: Optional[Dict[str, Dict]]
 register_type(Text)
 register_type(Number)
+register_type(TextTurn)
+register_type(ToolCallTurn)
+register_type(ToolOutputTurn)
 register_type(Dialog)
 register_type(Table)
 register_type(Audio)
 register_type(Image)
 register_type(Video)
+register_type(Conversation)
 register_type(Document)
 register_type(MultiDocument)
 register_type(RagResponse)

version.py CHANGED Viewed

	@@ -1 +1 @@
1	- version = "1.24.0"


1	+ version = "1.25.0"