Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on May 13

Commit

5b41acf

verified ·

1 Parent(s): 66630b0

Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

api.py +10 -8
artifact.py +11 -2
dataset.py +0 -1
dataset_utils.py +8 -5
inference.py +86 -29
metric.py +0 -1
metrics.py +76 -32
operators.py +47 -0
serializers.py +1 -6
struct_data_operators.py +21 -1
tool_calling.py +0 -119
type_utils.py +15 -3
types.py +12 -6
version.py +1 -1

api.py CHANGED Viewed

@@ -37,12 +37,11 @@ def short_hex_hash(value, length=8):
     return h[:length]
-def _get_recipe_from_query(dataset_query: str) -> DatasetRecipe:
-    dataset_query = dataset_query.replace("sys_prompt", "instruction")
     try:
-        dataset_stream, _ = fetch_artifact(dataset_query)
     except:
-        dataset_stream = get_dataset_artifact(dataset_query)
     return dataset_stream
@@ -82,14 +81,15 @@ def load_recipe(dataset_query: Optional[str] = None, **kwargs) -> DatasetRecipe:
     if isinstance(dataset_query, (DatasetRecipe, Benchmark)):
         return dataset_query
-    _verify_dataset_args(dataset_query, kwargs)
     if dataset_query:
-        recipe = _get_recipe_from_query(dataset_query)
-    if kwargs:
         recipe = _get_recipe_from_dict(kwargs)
     return recipe
@@ -187,6 +187,8 @@ def load_dataset(
     Alternatively, dataset is loaded from a provided card based on explicitly
     given parameters.
     Args:
         dataset_query (str, optional):
             A string query which specifies a dataset to load from

     return h[:length]
+def _get_recipe_from_query(dataset_query: str, overwrite_kwargs: Optional[Dict[str, Any]]=None) -> DatasetRecipe:
     try:
+        dataset_stream, _ = fetch_artifact(dataset_query, overwrite_kwargs=overwrite_kwargs)
     except:
+        dataset_stream = get_dataset_artifact(dataset_query, overwrite_kwargs=overwrite_kwargs)
     return dataset_stream
     if isinstance(dataset_query, (DatasetRecipe, Benchmark)):
         return dataset_query
     if dataset_query:
+        recipe = _get_recipe_from_query(dataset_query, kwargs)
+    elif kwargs:
         recipe = _get_recipe_from_dict(kwargs)
+    else:
+        raise UnitxtError("Specify either dataset recipe string artifact name or recipe args.")
     return recipe
     Alternatively, dataset is loaded from a provided card based on explicitly
     given parameters.
+    If both are given, then the textual recipe is loaded with the key word args overriding the textual recipe args.
     Args:
         dataset_query (str, optional):
             A string query which specifies a dataset to load from

artifact.py CHANGED Viewed

@@ -22,7 +22,7 @@ from .parsing_utils import (
     separate_inside_and_outside_square_brackets,
 )
 from .settings_utils import get_constants, get_settings
-from .text_utils import camel_to_snake_case, is_camel_case
 from .type_utils import isoftype, issubtype
 from .utils import (
     artifacts_json_cache,
@@ -369,6 +369,10 @@ class Artifact(Dataclass):
         data = self.to_dict()
         return json_dump(data)
     def serialize(self):
         if self.__id__ is not None:
             return self.__id__
@@ -528,7 +532,7 @@ class UnitxtArtifactNotFoundError(UnitxtError):
         super().__init__(msg)
-def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[AbstractCatalog, None]]:
     """Loads an artifict from one of possible representations.
     (1) If artifact representation is already an Artifact object, return it.
@@ -553,6 +557,11 @@ def fetch_artifact(artifact_rep) -> Tuple[Artifact, Union[AbstractCatalog, None]
         name, _ = separate_inside_and_outside_square_brackets(artifact_rep)
         if is_name_legal_for_catalog(name):
             catalog, artifact_rep, args = get_catalog_name_and_args(name=artifact_rep)
             artifact_to_return = catalog.get_with_overwrite(
                 artifact_rep, overwrite_args=args
             )

     separate_inside_and_outside_square_brackets,
 )
 from .settings_utils import get_constants, get_settings
+from .text_utils import camel_to_snake_case, is_camel_case, print_dict_as_yaml
 from .type_utils import isoftype, issubtype
 from .utils import (
     artifacts_json_cache,
         data = self.to_dict()
         return json_dump(data)
+    def to_yaml(self):
+        data = self.to_dict()
+        return print_dict_as_yaml(data)
     def serialize(self):
         if self.__id__ is not None:
             return self.__id__
         super().__init__(msg)
+def fetch_artifact(artifact_rep, overwrite_kwargs: Optional[Dict[str, Any]]=None) -> Tuple[Artifact, Union[AbstractCatalog, None]]:
     """Loads an artifict from one of possible representations.
     (1) If artifact representation is already an Artifact object, return it.
         name, _ = separate_inside_and_outside_square_brackets(artifact_rep)
         if is_name_legal_for_catalog(name):
             catalog, artifact_rep, args = get_catalog_name_and_args(name=artifact_rep)
+            if overwrite_kwargs is not None:
+                if args is None:
+                    args = overwrite_kwargs
+                else:
+                    args.update(overwrite_kwargs)
             artifact_to_return = catalog.get_with_overwrite(
                 artifact_rep, overwrite_args=args
             )

dataset.py CHANGED Viewed

@@ -68,7 +68,6 @@ from .system_prompts import __file__ as _
 from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
-from .tool_calling import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _
 from .utils import __file__ as _

 from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _
 from .utils import __file__ as _

dataset_utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from json.decoder import JSONDecodeError
 from .artifact import Artifact, UnitxtArtifactNotFoundError, fetch_artifact
 from .logging_utils import get_logger
@@ -11,19 +12,19 @@ logger = get_logger()
 settings = get_settings()
-def fetch(artifact_name):
     try:
-        artifact, _ = fetch_artifact(artifact_name)
         return artifact
     except (UnitxtArtifactNotFoundError, JSONDecodeError):
         return None
-def parse(query: str):
     return parse_key_equals_value_string_to_dict(query)
-def get_dataset_artifact(dataset):
     if isinstance(dataset, DatasetRecipe):
         return dataset
     assert isinstance(
@@ -31,10 +32,12 @@ def get_dataset_artifact(dataset):
     ), "dataset should be string description of recipe, or recipe object."
     _reset_env_local_catalogs()
     register_all_artifacts()
-    recipe = fetch(dataset)
     if recipe is None:
         args = parse(dataset)
         if "__type__" not in args:
             args["__type__"] = settings.default_recipe
         recipe = Artifact.from_dict(args)
     return recipe

 from json.decoder import JSONDecodeError
+from typing import Any, Dict, Optional
 from .artifact import Artifact, UnitxtArtifactNotFoundError, fetch_artifact
 from .logging_utils import get_logger
 settings = get_settings()
+def fetch(artifact_name: str, overwrite_kwargs: Optional[Dict[str, Any]]=None):
     try:
+        artifact, _ = fetch_artifact(artifact_name, overwrite_kwargs=overwrite_kwargs)
         return artifact
     except (UnitxtArtifactNotFoundError, JSONDecodeError):
         return None
+def parse(query: str) -> dict:
     return parse_key_equals_value_string_to_dict(query)
+def get_dataset_artifact(dataset, overwrite_kwargs: Optional[Dict[str, Any]]=None):
     if isinstance(dataset, DatasetRecipe):
         return dataset
     assert isinstance(
     ), "dataset should be string description of recipe, or recipe object."
     _reset_env_local_catalogs()
     register_all_artifacts()
+    recipe = fetch(dataset, overwrite_kwargs=overwrite_kwargs)
     if recipe is None:
         args = parse(dataset)
         if "__type__" not in args:
             args["__type__"] = settings.default_recipe
+        if overwrite_kwargs is not None:
+            args.update(overwrite_kwargs)
         recipe = Artifact.from_dict(args)
     return recipe

inference.py CHANGED Viewed

@@ -344,6 +344,8 @@ class InferenceEngine(Artifact):
     def to_tools(self, instance):
         task_data = instance.get("task_data")
         if isinstance(task_data, str):
             task_data = json.loads(task_data)
         if "__tools__" in task_data:
@@ -445,6 +447,8 @@ class HFInferenceEngineBase(
     model: Any = InternalField(default=None, name="Inference object")
     processor: Any = InternalField(default=None, name="Input processor (tokenizer)")
     _requirements_list = {
         "transformers": "Install huggingface package using 'pip install --upgrade transformers",
         "torch": "Install torch, go on PyTorch website for mode details.",
@@ -655,8 +659,6 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
     truncation: bool = True
     padding_side: str = "left"  # for decoder only models
-    chat_kwargs_dict: dict = {}
     def _init_processor(self):
         from transformers import AutoTokenizer
@@ -712,10 +714,9 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
             trust_remote_code=True,
             **model_args,
         )
-        if self.device_map is None:
-            self.model.to(self.device)
     def prepare_inputs(self, data: Iterable) -> Mapping:
         if isinstance(data[0], list):
             data = self.processor.apply_chat_template(
                 data,
@@ -723,6 +724,7 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
                 add_generation_prompt=True,
                 **self.chat_kwargs_dict,
             )
         if self.processor.pad_token is None:
             self.processor.pad_token_id = self.model.config.eos_token_id[0]
@@ -733,6 +735,8 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
             padding=self.padding,
             truncation=self.truncation,
             padding_side=self.padding_side,
         ).to(self.device or self.device_map)
     def _infer_fn(
@@ -755,13 +759,14 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
         """
         all_final_outputs = []  # List to store results from all batches
-        for i in tqdm(
-            range(0, len(dataset), self.batch_size),
             desc=f"Running inference in batches of {self.batch_size}",
         ):
             # Get the current batch
-            batch_data = dataset[i : i + self.batch_size]
-            batch_sources = [instance["source"] for instance in batch_data]
             # --- Process the current batch ---
             # 1. Tokenize inputs for the batch
@@ -800,7 +805,7 @@ class HFAutoModelInferenceEngine(HFInferenceEngineBase):
                         j
                     ],  # Output for the j-th item in the batch
                     output_tokens=len(string_tokens_batch[j]),
-                    inp=batch_data[j]["source"],  # Original input for the j-th item
                     inp_tokens=len(tokenized_inputs.encodings[j].tokens)
                     if tokenized_inputs.encodings is not None
                     else None,
@@ -1840,15 +1845,26 @@ class OpenAiInferenceEngine(
     @run_with_imap
     def _get_chat_completion(self, instance, return_meta_data):
         import openai
         messages = self.to_messages(instance)
         try:
             response = self.client.chat.completions.create(
                 messages=messages,
                 model=self.get_client_model_name(),
                 **self._get_completion_kwargs(),
             )
-            prediction = response.choices[0].message.content
             return self.get_return_object(prediction, response, return_meta_data)
         # catch in case of content_filtering failure
         except openai.BadRequestError as e:
@@ -2742,14 +2758,37 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
         # images as SDK allows sending only one image per message.
         return [messages]
     def _handle_async_requests(
         self,
-        messages: List[List[Dict[str, Any]]],
         params: Dict[str, Any],
     ) -> List[Dict[str, Any]]:
         async def handle_async_requests(start_idx, end_idx):
             coroutines = [
-                self._model.achat(messages=messages[idx], params=params)
                 for idx in range(start_idx, end_idx)
             ]
             batch_results = await asyncio.gather(*coroutines)
@@ -2758,10 +2797,10 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
         loop = asyncio.get_event_loop()
         results = []
-        for batch_idx in range(0, len(messages), self.concurrency_limit):
             batch_results = loop.run_until_complete(
                 handle_async_requests(
-                    batch_idx, min(batch_idx + self.concurrency_limit, len(messages))
                 )
             )
             results.extend(batch_results)
@@ -2783,25 +2822,43 @@ class WMLInferenceEngineChat(WMLInferenceEngineBase, WMLChatParamsMixin):
             output_type = "message"
             params["logprobs"] = False
-        indexed_messages = [
-            (i, message)
             for i in range(len(dataset))
             for message in self.to_messages(dataset[i])
         ]
-        results = self._handle_async_requests(
-            [msg[1] for msg in indexed_messages], params
-        )
-        return [
-            self.get_return_object(
-                result["choices"][0][output_type]["content"],
-                result,
-                dataset[idx[0]]["source"],
-                return_meta_data,
             )
-            for result, idx in zip(results, indexed_messages)
-        ]
     def get_return_object(self, predict_result, result, input_text, return_meta_data):
         if return_meta_data:
@@ -3439,7 +3496,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         "aws": LiteLLMInferenceEngine,
         "ollama": OllamaInferenceEngine,
         "bam": IbmGenAiInferenceEngine,
-        "watsonx-sdk": WMLInferenceEngine,
         "rits": RITSInferenceEngine,
         "azure": LiteLLMInferenceEngine,
         "vertex-ai": LiteLLMInferenceEngine,

     def to_tools(self, instance):
         task_data = instance.get("task_data")
+        if task_data is None:
+            return None
         if isinstance(task_data, str):
             task_data = json.loads(task_data)
         if "__tools__" in task_data:
     model: Any = InternalField(default=None, name="Inference object")
     processor: Any = InternalField(default=None, name="Input processor (tokenizer)")
+    chat_kwargs_dict: dict = {}
     _requirements_list = {
         "transformers": "Install huggingface package using 'pip install --upgrade transformers",
         "torch": "Install torch, go on PyTorch website for mode details.",
     truncation: bool = True
     padding_side: str = "left"  # for decoder only models
     def _init_processor(self):
         from transformers import AutoTokenizer
             trust_remote_code=True,
             **model_args,
         )
     def prepare_inputs(self, data: Iterable) -> Mapping:
+        tokenizer_kargs = {}
         if isinstance(data[0], list):
             data = self.processor.apply_chat_template(
                 data,
                 add_generation_prompt=True,
                 **self.chat_kwargs_dict,
             )
+            tokenizer_kargs["add_special_tokens"] = False
         if self.processor.pad_token is None:
             self.processor.pad_token_id = self.model.config.eos_token_id[0]
             padding=self.padding,
             truncation=self.truncation,
             padding_side=self.padding_side,
+            **tokenizer_kargs
         ).to(self.device or self.device_map)
     def _infer_fn(
         """
         all_final_outputs = []  # List to store results from all batches
+        for batch in tqdm(
+            batched(dataset, self.batch_size),
             desc=f"Running inference in batches of {self.batch_size}",
+            total=len(dataset) // self.batch_size,
         ):
             # Get the current batch
+            batch_sources = [instance["source"] for instance in batch]
             # --- Process the current batch ---
             # 1. Tokenize inputs for the batch
                         j
                     ],  # Output for the j-th item in the batch
                     output_tokens=len(string_tokens_batch[j]),
+                    inp=batch[j]["source"],  # Original input for the j-th item
                     inp_tokens=len(tokenized_inputs.encodings[j].tokens)
                     if tokenized_inputs.encodings is not None
                     else None,
     @run_with_imap
     def _get_chat_completion(self, instance, return_meta_data):
         import openai
+        tools = self.to_tools(instance)
         messages = self.to_messages(instance)
         try:
             response = self.client.chat.completions.create(
                 messages=messages,
+                tools=tools,
                 model=self.get_client_model_name(),
                 **self._get_completion_kwargs(),
+#                tool_choice="auto"
             )
+            if tools is None:
+                prediction = response.choices[0].message.content
+            else:
+                try:
+                    func_call = response.choices[0].message.tool_calls[0].function
+                    prediction = f'{{"name": "{func_call.name}", "arguments": {func_call.arguments}}}'
+                except:
+                    prediction = response.choices[0].message.content or ""
             return self.get_return_object(prediction, response, return_meta_data)
         # catch in case of content_filtering failure
         except openai.BadRequestError as e:
         # images as SDK allows sending only one image per message.
         return [messages]
+    def to_tools(
+        self,
+        instance: Dict[str, Any]
+    ) -> Dict[str, Union[Optional[List[Dict[str, str]]], Optional[Dict[str, str]]]]:
+        """watsonx.ai chat also allows specifying which tools models must use."""
+        task_data = instance.get("task_data")
+        if task_data is None:
+            return {"tools": None, "tool_choice": None}
+        if isinstance(task_data, str):
+            task_data = json.loads(task_data)
+        if "__tools__" in task_data:
+            tools: List[Dict[str, str]] = task_data["__tools__"]
+            tool_choice: Optional[Dict[str, str]] = task_data.get("__tool_choice__")
+            return {"tools": tools, "tool_choice": tool_choice}
+        return {"tools": None, "tool_choice": None}
     def _handle_async_requests(
         self,
+        data: List[Dict[str, Any]],
         params: Dict[str, Any],
     ) -> List[Dict[str, Any]]:
         async def handle_async_requests(start_idx, end_idx):
             coroutines = [
+                self._model.achat(
+                    messages=data[idx]["msg"],
+                    params=params,
+                    tools=data[idx]["tools"]["tools"],
+                    tool_choice=data[idx]["tools"]["tool_choice"],
+                )
                 for idx in range(start_idx, end_idx)
             ]
             batch_results = await asyncio.gather(*coroutines)
         loop = asyncio.get_event_loop()
         results = []
+        for batch_idx in range(0, len(data), self.concurrency_limit):
             batch_results = loop.run_until_complete(
                 handle_async_requests(
+                    batch_idx, min(batch_idx + self.concurrency_limit, len(data))
                 )
             )
             results.extend(batch_results)
             output_type = "message"
             params["logprobs"] = False
+        data = [
+            {
+                "idx": i,
+                "msg": message,
+                "tools": self.to_tools(dataset[i]),
+            }
             for i in range(len(dataset))
             for message in self.to_messages(dataset[i])
         ]
+        responses = self._handle_async_requests(data, params)
+        results = []
+        for inp, response in zip(data, responses):
+            idx = inp["idx"]
+            tool_call = data[idx]["tools"]["tools"] is not None
+            output = response["choices"][0][output_type]
+            if tool_call:
+                if "tool_calls" in output:
+                    func = output["tool_calls"][0]["function"]
+                    prediction = f'{{"name": "{func["name"]}", "arguments": {func["arguments"]}}}'
+                else:
+                    prediction = output["content"]
+            else:
+                prediction = output["content"]
+            results.append(
+                self.get_return_object(
+                    prediction,
+                    response,
+                    str(inp),
+                    return_meta_data,
+                )
             )
+        return results
     def get_return_object(self, predict_result, result, input_text, return_meta_data):
         if return_meta_data:
         "aws": LiteLLMInferenceEngine,
         "ollama": OllamaInferenceEngine,
         "bam": IbmGenAiInferenceEngine,
+        "watsonx-sdk": WMLInferenceEngineChat,
         "rits": RITSInferenceEngine,
         "azure": LiteLLMInferenceEngine,
         "vertex-ai": LiteLLMInferenceEngine,

metric.py CHANGED Viewed

@@ -65,7 +65,6 @@ from .system_prompts import __file__ as _
 from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
-from .tool_calling import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _
 from .utils import __file__ as _

 from .task import __file__ as _
 from .templates import __file__ as _
 from .text_utils import __file__ as _
 from .type_utils import __file__ as _
 from .types import __file__ as _
 from .utils import __file__ as _

metrics.py CHANGED Viewed

@@ -63,7 +63,6 @@ from .operators import ArtifactFetcherMixin, Copy, Set
 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
-from .tool_calling import convert_chat_api_format_to_tool
 from .type_utils import Type, isoftype, parse_type_string, to_type_string
 from .types import ToolCall
 from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff
@@ -789,74 +788,92 @@ class F1Fast(MapReduceMetric[str, Tuple[int, int]]):
         return result
 class ToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]]):
     main_score = "exact_match"
     reduction = MeanReduction()
     prediction_type = ToolCall
     def map(
         self, prediction: ToolCall, references: List[ToolCall], task_data: Dict[str, Any]
     ) -> Dict[str, float]:
         exact_match = float(
-            str(prediction) in [str(reference) for reference in references]
         )
-        tool_choice = float(
             str(prediction["name"]) in [str(reference["name"]) for reference in references]
         )
-        parameter_choice = 0.0
         for reference in references:
-            if len(prediction["arguments"]) > 0:
                 score = len(set(prediction["arguments"]).intersection(set(reference["arguments"]))) / len(set(prediction["arguments"]))
-            else:
                 score = 1.0
-            if score > parameter_choice:
-                parameter_choice = score
-        parameter_values = 0.0
         for reference in references:
             value_matches = 0
             for key, val in prediction["arguments"].items():
                 try:
-                    if val in reference["arguments"][key] or reference["arguments"][key] in val:
                         value_matches += 1
                 except:
                     pass
             if len(prediction["arguments"]) > 0:
                 score = value_matches / len(prediction["arguments"])
             else:
                 score = 1.0
-            if score > parameter_values:
-                parameter_values = score
         for tool in task_data["__tools__"]:
-            tool = convert_chat_api_format_to_tool(tool)
-            tool_params_types = {}
-            for param in tool["parameters"]:
-                tool_params_types[param["name"]] = param["type"]
-            correct_parameters_types = 0
-            for key, value in prediction["arguments"].items():
-                typing_type = tool_params_types.get(key, Any)
-                if isoftype(value, typing_type):
-                    correct_parameters_types += 1
-            if len(prediction["arguments"]) > 0:
-                parameters_types = correct_parameters_types / len(prediction["arguments"])
-            else:
-                parameters_types = 1.0
         return {
             self.main_score: exact_match,
-            "tool_choice": tool_choice,
-            "parameter_choice": parameter_choice,
-            "parameters_types": parameters_types,
-            "parameter_values": parameter_values
         }
@@ -3499,7 +3516,7 @@ class CustomF1(GlobalMetric):
 class KeyValueExtraction(GlobalMetric):
     prediction_type = Dict[str, str]
     metric: Metric
-    single_reference_per_prediction = True
     main_score = ""
     def prepare(self):
@@ -3575,6 +3592,33 @@ class KeyValueExtraction(GlobalMetric):
         return result
 class NER(CustomF1):
     """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""

 from .random_utils import get_seed
 from .settings_utils import get_settings
 from .stream import MultiStream, Stream
 from .type_utils import Type, isoftype, parse_type_string, to_type_string
 from .types import ToolCall
 from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff
         return result
 class ToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]]):
+    """Compares each predicted tool call with list of references tool call."""
     main_score = "exact_match"
     reduction = MeanReduction()
     prediction_type = ToolCall
+    _requirements_list = ["jsonschema-rs"]
+    def prepare(self):
+        super().prepare()
+        import jsonschema_rs
+        self._schema = jsonschema_rs
     def map(
         self, prediction: ToolCall, references: List[ToolCall], task_data: Dict[str, Any]
     ) -> Dict[str, float]:
         exact_match = float(
+            json.dumps(prediction, sort_keys=True) in [json.dumps(reference, sort_keys=True) for reference in references]
         )
+        tool_name_accuracy = float(
             str(prediction["name"]) in [str(reference["name"]) for reference in references]
         )
+        argument_name_recall = 0.0
         for reference in references:
+            if len(reference["arguments"]) > 0:
+                score = len(set(prediction["arguments"]).intersection(set(reference["arguments"]))) / len(set(reference["arguments"]))
+            else:
+                score = 1.0
+            if score > argument_name_recall:
+                argument_name_recall = score
+        argument_name_precision = 0.0
+        for reference in references:
+            if len(prediction["arguments"]) > 0:
                 score = len(set(prediction["arguments"]).intersection(set(reference["arguments"]))) / len(set(prediction["arguments"]))
+            elif len(reference["arguments"]) == 0:
                 score = 1.0
+            else:
+                score = 0.0
+            if score > argument_name_precision:
+                argument_name_precision = score
+        argument_value_precision = 0.0
         for reference in references:
             value_matches = 0
             for key, val in prediction["arguments"].items():
                 try:
+                    predicted = json.dumps(val, sort_keys=True)
+                    target = json.dumps(reference["arguments"][key], sort_keys=True)
+                    if predicted == target:
                         value_matches += 1
                 except:
                     pass
             if len(prediction["arguments"]) > 0:
                 score = value_matches / len(prediction["arguments"])
             else:
                 score = 1.0
+            if score > argument_value_precision:
+                argument_value_precision = score
+        parameters = None
         for tool in task_data["__tools__"]:
+            if tool["function"]["name"] == prediction["name"]:
+                parameters = tool["function"]["parameters"]
+        if parameters is None:
+            argument_schema_validation = 0.0
+        else:
+            try:
+                self._schema.validate(parameters, prediction["arguments"], )
+                argument_schema_validation = 1.0
+            except self._schema.ValidationError:
+                argument_schema_validation = 0.0
         return {
             self.main_score: exact_match,
+            "tool_name_accuracy": tool_name_accuracy,
+            "argument_name_recall": argument_name_recall,
+            "argument_name_precision": argument_name_precision,
+            "argument_value_precision": argument_value_precision,
+            "argument_schema_validation": argument_schema_validation,
         }
 class KeyValueExtraction(GlobalMetric):
     prediction_type = Dict[str, str]
     metric: Metric
+    single_reference_per_prediction = False
     main_score = ""
     def prepare(self):
         return result
+class  ToolCallKeyValueExtraction(KeyValueExtraction):
+    prediction_type = ToolCall
+    def flatten_dict(self,nested_dict, parent_key="", sep="."):
+        flat_dict = {}
+        for k, v in nested_dict.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, list):
+                for e in v:
+                    if isinstance(e,dict):
+                        flat_dict.update(self.flatten_dict(e, new_key, sep=sep))
+            elif isinstance(v, dict):
+                flat_dict.update(self.flatten_dict(v, new_key, sep=sep))
+            else:
+                flat_dict[new_key] = v
+        return flat_dict
+    def compute(
+        self,
+        references: List[List[ToolCall]],
+        predictions: List[ToolCall],
+        task_data: List[Dict],
+    ) -> dict:
+        return super().compute([[ self.flatten_dict(r) for r in ref ] for ref in references],
+                    [ self.flatten_dict(p) for p in predictions],task_data)
 class NER(CustomF1):
     """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""

operators.py CHANGED Viewed

@@ -283,6 +283,53 @@ class Set(InstanceOperator):
             dict_set(instance, key, value)
         return instance
 @deprecation(version="2.0.0", alternative=Set)
 class AddFields(Set):

             dict_set(instance, key, value)
         return instance
def recursive_key_value_replace(data, target_key, value_map, value_remove=None):
    """Replace or remove the values stored under ``target_key`` in a nested structure.

    Dicts and lists are traversed recursively and modified in place.

    Args:
        data: The data structure (dict or list) to traverse. Any other type is
            returned untouched.
        target_key: The key whose values are replaced or removed.
        value_map: A dictionary mapping old values to new values.
        value_remove: Values to drop when found under ``target_key``. A scalar
            match deletes the key itself; a match inside a list deletes only
            that list item. Defaults to no removals.

    Returns:
        The same ``data`` object, after in-place modification.
    """
    if value_remove is None:
        value_remove = []

    def _mapped(item):
        # Unhashable items can never be keys of value_map, so they map to themselves.
        try:
            return value_map.get(item, item)
        except TypeError:
            return item

    if isinstance(data, dict):
        keys_to_delete = []
        for key, value in data.items():
            if key != target_key or isinstance(value, dict):
                # A dict stored under target_key (e.g. a field that merely shares
                # the key's name) is not itself a replaceable value, but it may
                # contain further occurrences of target_key.
                recursive_key_value_replace(value, target_key, value_map, value_remove)
            elif isinstance(value, list):
                kept = []
                for item in value:
                    if isinstance(item, (dict, list)):
                        # Containers are not candidates for mapping or removal:
                        # keep them and process whatever they contain.
                        recursive_key_value_replace(
                            item, target_key, value_map, value_remove
                        )
                        kept.append(item)
                    elif item not in value_remove:
                        kept.append(_mapped(item))
                data[key] = kept
            elif value in value_remove:
                keys_to_delete.append(key)
            else:
                data[key] = _mapped(value)
        # Deletion is deferred: a dict must not change size while being iterated.
        for key in keys_to_delete:
            del data[key]
    elif isinstance(data, list):
        for item in data:
            recursive_key_value_replace(item, target_key, value_map, value_remove)
    return data
class RecursiveReplace(InstanceOperator):
    """Instance operator that applies ``recursive_key_value_replace`` to the whole instance.

    The instance is modified in place and returned.
    """

    # Key whose values are replaced or removed wherever it occurs in the instance.
    key: str
    # Mapping from old values to their replacements.
    map_values: dict
    # Values to remove when found under ``key``; None means nothing is removed.
    remove_values: Optional[list] = None

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run the recursive replacement over ``instance``; ``stream_name`` is unused."""
        return recursive_key_value_replace(
            data=instance,
            target_key=self.key,
            value_map=self.map_values,
            value_remove=self.remove_values,
        )
 @deprecation(version="2.0.0", alternative=Set)
 class AddFields(Set):

serializers.py CHANGED Viewed

@@ -7,7 +7,6 @@ from typing import Any, Dict, List, Union
 from .dataclass import AbstractField, Field
 from .operators import InstanceFieldOperator
 from .settings_utils import get_constants
-from .tool_calling import convert_to_chat_api_format
 from .type_utils import isoftype, to_type_string
 from .types import (
     Dialog,
@@ -168,24 +167,20 @@ class MultiDocumentSerializer(DocumentSerializer):
 class ToolsSerializer(SingleTypeSerializer):
     serialized_type = List[Tool]
-    _requirements_list: List[str] = ["pydantic"]
     def serialize(self, value: List[Tool], instance: Dict[str, Any]) -> str:
         if "__tools__" not in instance:
             instance["__tools__"] = []
         tool = []
         for tool in value:
-            chat_api_tool = convert_to_chat_api_format(tool=tool)
             instance["__tools__"].append(
-                chat_api_tool
             )
-            tool["parameters"] = chat_api_tool["function"]["parameters"]
         return json.dumps(instance["__tools__"], indent=4)
 class ToolCallSerializer(SingleTypeSerializer):
     serialized_type = ToolCall
-    _requirements_list: List[str] = ["pydantic"]
     def serialize(self, value: ToolCall, instance: Dict[str, Any]) -> str:
         return json.dumps(value)

 from .dataclass import AbstractField, Field
 from .operators import InstanceFieldOperator
 from .settings_utils import get_constants
 from .type_utils import isoftype, to_type_string
 from .types import (
     Dialog,
 class ToolsSerializer(SingleTypeSerializer):
     """Serialize a list of tools as JSON in the ``{"type": "function", ...}`` wrapper format.

     Each tool is also recorded on the instance under ``"__tools__"``.
     """

     serialized_type = List[Tool]

     def serialize(self, value: List[Tool], instance: Dict[str, Any]) -> str:
         """Append the wrapped tools to ``instance["__tools__"]`` and dump that list.

         Args:
             value: The tools to serialize.
             instance: The instance being processed; its ``"__tools__"`` list is
                 created if missing and extended in place.

         Returns:
             The whole ``instance["__tools__"]`` list as an indented JSON string.
         """
         tools = instance.setdefault("__tools__", [])
         tools.extend({"type": "function", "function": tool} for tool in value)
         return json.dumps(tools, indent=4)
 class ToolCallSerializer(SingleTypeSerializer):
     serialized_type = ToolCall
     def serialize(self, value: ToolCall, instance: Dict[str, Any]) -> str:
         return json.dumps(value)

struct_data_operators.py CHANGED Viewed

@@ -43,7 +43,7 @@ from .operators import FieldOperator, InstanceOperator
 from .random_utils import new_random_generator
 from .serializers import ImageSerializer, TableSerializer
 from .type_utils import isoftype
-from .types import Table
 from .utils import recursive_copy
@@ -754,6 +754,26 @@ class LoadJson(FieldOperator):
             return json.loads(value, strict=False)
 class DumpJson(FieldOperator):
     def process_value(self, value: str) -> str:
         return json.dumps(value)

 from .random_utils import new_random_generator
 from .serializers import ImageSerializer, TableSerializer
 from .type_utils import isoftype
+from .types import Table, ToolCall
 from .utils import recursive_copy
             return json.loads(value, strict=False)
class ToolCallPostProcessor(FieldOperator):
    """Parse a JSON string into a single ``ToolCall``.

    The parsed value may be a tool call or a list holding exactly one tool
    call. Anything else (an empty list, several calls, or a value that is not
    a ``ToolCall``) yields ``failure_value``.
    """

    # Value returned when the text cannot be turned into exactly one ToolCall.
    failure_value: Any = None
    # When True, malformed JSON yields failure_value instead of raising.
    allow_failure: bool = False

    def process_value(self, value: str) -> ToolCall:
        """Return the tool call encoded in ``value`` or ``failure_value``.

        Raises:
            json.JSONDecodeError: If ``value`` is not valid JSON and
                ``allow_failure`` is False.
        """
        if self.allow_failure:
            try:
                # NOTE(review): this branch parses in strict mode while the
                # other one passes strict=False — confirm this is intended.
                result = json.loads(value)
            except json.JSONDecodeError:
                return self.failure_value
        else:
            result = json.loads(value, strict=False)

        if isoftype(result, List[ToolCall]):
            if len(result) == 1:
                return result[0]
            if len(result) > 1:
                UnitxtWarning(f"More than one tool returned from model: {result}")
            # An empty list has no first element; indexing it would raise
            # IndexError, so it is reported as a failure like any other
            # unusable output.
            return self.failure_value
        if not isoftype(result, ToolCall):
            return self.failure_value
        return result
 class DumpJson(FieldOperator):
     def process_value(self, value: str) -> str:
         return json.dumps(value)

tool_calling.py DELETED Viewed

@@ -1,119 +0,0 @@
-from typing import Any, Dict, List, Type
-from .operators import FieldOperator
-from .types import Parameter, Tool
-def convert_to_chat_api_format(tool: Tool) -> Dict[str, Any]:
-    from pydantic import create_model
-    field_definitions = {}
-    for param in tool["parameters"]:
-        param_name = param["name"]
-        param_type = param.get("type", Any)
-        field_definitions[param_name] = (param_type, ...)  # ... means required in Pydantic
-    model = create_model(f"{tool['name']}Params", **field_definitions)
-    schema = model.model_json_schema()
-    return {
-        "type": "function",
-        "function": {
-            "name": tool["name"],
-            "description": tool["description"],
-            "parameters": schema
-        }
-    }
-def convert_chat_api_format_to_tool(chat_api_tool: Dict[str, Any]) -> Tool:
-    """Convert a Chat API formatted tool back to the original Tool structure.
-    Args:
-        chat_api_tool: A dictionary representing a tool in Chat API format
-    Returns:
-        A Tool dictionary with name, description, and parameters
-    """
-    # Extract function information
-    function_info = chat_api_tool.get("function", {})
-    name = function_info.get("name", chat_api_tool.get("name", ""))
-    description = function_info.get("description", chat_api_tool.get("description", ""))
-    # Extract parameters from schema
-    parameters: List[Parameter] = []
-    schema = function_info.get("parameters",  chat_api_tool.get("parameters", ""))
-    properties = schema.get("properties", {})
-    for param_name, param_schema in properties.items():
-        # Map JSON schema type to Python type
-        param_type = json_schema_to_python_type(param_schema)
-        parameter: Parameter = {
-            "name": param_name,
-            "type": param_type
-        }
-        parameters.append(parameter)
-    # Construct and return the Tool
-    tool: Tool = {
-        "name": name,
-        "description": description,
-        "parameters": parameters
-    }
-    return tool
-def json_schema_to_python_type(schema: Dict[str, Any]) -> Type:
-    """Convert JSON schema type to Python type."""
-    from typing import Any, Dict, List, Union
-    schema_type = schema.get("type")
-    # Handle simple types
-    simple_types = {
-        "string": str,
-        "integer": int,
-        "number": float,
-        "boolean": bool,
-        "null": type(None)
-    }
-    if schema_type in simple_types:
-        return simple_types[schema_type]
-    # Handle arrays
-    if schema_type == "array":
-        items = schema.get("items", {})
-        if not items:
-            return List[Any]
-        item_type = json_schema_to_python_type(items)
-        return List[item_type]
-    # Handle objects
-    if schema_type == "object":
-        return Dict[str, Any]
-    # Handle unions with anyOf/oneOf
-    if "anyOf" in schema or "oneOf" in schema:
-        union_schemas = schema.get("anyOf", []) or schema.get("oneOf", [])
-        union_types = [json_schema_to_python_type(s) for s in union_schemas]
-        # Use Union for Python 3.9+ or create Union using typing module
-        return Union[tuple(union_types)] if union_types else Any
-    # Handle references (simplified)
-    if "$ref" in schema:
-        # In a real implementation, you'd resolve references
-        return Any
-    # Default to Any for unrecognized schema types
-    return Any
-class ToTool(FieldOperator):
-    def process_value(self, value: Dict[str, Any]) -> Tool:
-        return convert_chat_api_format_to_tool(value)

type_utils.py CHANGED Viewed

@@ -27,7 +27,7 @@ _registered_types = {
 def register_type(new_type):
     assert is_new_type(new_type) or is_typed_dict(
         new_type
-    ), "Can register only typing.NewType or typing.TypedDict"
     _registered_types[new_type.__name__] = new_type
@@ -489,6 +489,9 @@ def isoftype(object, typing_type):
     if not is_type(typing_type):
         raise UnsupportedTypeError(typing_type)
     if typing_type is typing.Type:
         return is_type(object)
@@ -1066,9 +1069,18 @@ def verify_required_schema(
                 f"{class_name} description: {description}"
             ) from e
-        if not isoftype(value, data_type):
             raise ValueError(
-                f"Passed value '{value}' of field '{field_name}' is not "
                 f"of required type: ({to_type_string(data_type)}) in {class_name} ('{id}').\n"
                 f"{class_name} description: {description}"
             )

 def register_type(new_type):
     assert is_new_type(new_type) or is_typed_dict(
         new_type
+    ) or hasattr(new_type, "__verify_type__"), "Can register only typing.NewType or typing.TypedDict or object with __verify_type__ class function"
     _registered_types[new_type.__name__] = new_type
     if not is_type(typing_type):
         raise UnsupportedTypeError(typing_type)
+    if hasattr(typing_type, "__verify_type__"):
+        return typing_type.__verify_type__(object)
     if typing_type is typing.Type:
         return is_type(object)
                 f"{class_name} description: {description}"
             ) from e
+        try:
+            valid = isoftype(value, data_type)
+        except Exception as e:
+            raise ValueError(
+                    f"Passed value {value} of field '{field_name}' is not "
+                    f"of required type: ({to_type_string(data_type)}) in {class_name} ('{id}').\n"
+                    f"{class_name} description: {description}\nReason:\n{e}"
+                ) from e
+        if not valid:
             raise ValueError(
+                f"Passed value {value} of field '{field_name}' is not "
                 f"of required type: ({to_type_string(data_type)}) in {class_name} ('{id}').\n"
                 f"{class_name} description: {description}"
             )

types.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Literal, NewType, Optional, Type, TypedDict, Union
 from .type_utils import register_type
@@ -51,14 +51,20 @@ class SQLDatabase(TypedDict):
     dbms: Optional[str]
     data: Optional[Dict[str, Dict]]
-class Parameter(TypedDict):
-    name: str
-    type: Optional[Type]  # Using actual Python type objects
 class Tool(TypedDict):
     name: str
     description: str
-    parameters: List[Parameter]
 class ToolCall(TypedDict):
     name: str
@@ -76,7 +82,7 @@ register_type(Document)
 register_type(MultiDocument)
 register_type(RagResponse)
 register_type(SQLDatabase)
-register_type(Parameter)
 register_type(Tool)
 register_type(ToolCall)

+from typing import Any, Dict, List, Literal, NewType, Optional, TypedDict, Union
 from .type_utils import register_type
     dbms: Optional[str]
     data: Optional[Dict[str, Dict]]
class JsonSchema:
    """Marker type for values that must be a JSON Schema document.

    The ``__verify_type__`` hook is what makes this class usable as a
    registered type.
    """

    @classmethod
    def __verify_type__(cls, object):
        """Check whether ``object`` is a dict accepted by the JSON Schema meta-validator.

        Returns:
            False for any non-dict value, True when meta-validation completes.
            Presumably ``jsonschema_rs.meta.validate`` raises for an invalid
            schema, in which case that exception propagates — confirm against
            the jsonschema_rs documentation.
        """
        if isinstance(object, dict):
            # Imported here so the dependency is only needed when a dict is checked.
            import jsonschema_rs

            jsonschema_rs.meta.validate(object)
            return True
        return False
 class Tool(TypedDict):
     name: str
     description: str
+    parameters: JsonSchema
 class ToolCall(TypedDict):
     name: str
 register_type(MultiDocument)
 register_type(RagResponse)
 register_type(SQLDatabase)
 register_type(Tool)
+register_type(JsonSchema)
 register_type(ToolCall)

version.py CHANGED Viewed

	@@ -1 +1 @@
1	- version = "1.22.4"


1	+ version = "1.23.0"