Upload folder using huggingface_hub
Browse files
- README.md +5 -5
- api.py +2 -2
- metrics.py +14 -0
- struct_data_operators.py +21 -0
- templates.py +6 -3
- text_utils.py +1 -1
- version.py +1 -1
README.md
CHANGED
|
@@ -40,11 +40,11 @@ https://github.com/IBM/unitxt/assets/23455264/baef9131-39d4-4164-90b2-05da52919f
|
|
| 40 |
|
| 41 |
### 🦄 Currently on Unitxt Catalog
|
| 42 |
|
| 43 |
-

|
| 44 |
+

|
| 45 |
+

|
| 46 |
+

|
| 47 |
+

|
| 48 |
|
| 49 |
### 🦄 Run Unitxt Exploration Dashboard
|
| 50 |
|
api.py
CHANGED
|
@@ -145,8 +145,8 @@ def _source_to_dataset(
|
|
| 145 |
cache_dir = dir_to_be_deleted if not use_cache else None
|
| 146 |
ds_builder = UnitxtDataset(
|
| 147 |
dataset_name="unitxt",
|
| 148 |
-
config_name="recipe-" + short_hex_hash(source
|
| 149 |
-
hash=hash(source
|
| 150 |
version=constants.version,
|
| 151 |
cache_dir=cache_dir,
|
| 152 |
)
|
|
|
|
| 145 |
cache_dir = dir_to_be_deleted if not use_cache else None
|
| 146 |
ds_builder = UnitxtDataset(
|
| 147 |
dataset_name="unitxt",
|
| 148 |
+
config_name="recipe-" + short_hex_hash(repr(source)),
|
| 149 |
+
hash=hash(repr(source)),
|
| 150 |
version=constants.version,
|
| 151 |
cache_dir=cache_dir,
|
| 152 |
)
|
metrics.py
CHANGED
|
@@ -3355,6 +3355,8 @@ class CustomF1(GlobalMetric):
|
|
| 3355 |
|
| 3356 |
|
| 3357 |
class NER(CustomF1):
|
|
|
|
|
|
|
| 3358 |
prediction_type = List[Tuple[str, str]]
|
| 3359 |
|
| 3360 |
def get_element_group(self, element, additional_input):
|
|
@@ -3364,6 +3366,18 @@ class NER(CustomF1):
|
|
| 3364 |
return str(element)
|
| 3365 |
|
| 3366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3367 |
def normalize_answer(s):
|
| 3368 |
"""Lower text and remove punctuation, articles and extra whitespace."""
|
| 3369 |
|
|
|
|
| 3355 |
|
| 3356 |
|
| 3357 |
class NER(CustomF1):
|
| 3358 |
+
"""F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
|
| 3359 |
+
|
| 3360 |
prediction_type = List[Tuple[str, str]]
|
| 3361 |
|
| 3362 |
def get_element_group(self, element, additional_input):
|
|
|
|
| 3366 |
return str(element)
|
| 3367 |
|
| 3368 |
|
| 3369 |
+
class KeyValueExtraction(CustomF1):
|
| 3370 |
+
"""F1 Metrics that receives as input a list of (Key,Value) pairs."""
|
| 3371 |
+
|
| 3372 |
+
prediction_type = List[Tuple[str, str]]
|
| 3373 |
+
|
| 3374 |
+
def get_element_group(self, element, additional_input):
|
| 3375 |
+
return element[0]
|
| 3376 |
+
|
| 3377 |
+
def get_element_representation(self, element, additional_input):
|
| 3378 |
+
return str(element)
|
| 3379 |
+
|
| 3380 |
+
|
| 3381 |
def normalize_answer(s):
|
| 3382 |
"""Lower text and remove punctuation, articles and extra whitespace."""
|
| 3383 |
|
struct_data_operators.py
CHANGED
|
@@ -23,6 +23,7 @@ For key-value pairs, expected input format is:
|
|
| 23 |
{"key1": "value1", "key2": value2, "key3": "value3"}
|
| 24 |
"""
|
| 25 |
|
|
|
|
| 26 |
import json
|
| 27 |
import random
|
| 28 |
from abc import ABC, abstractmethod
|
|
@@ -31,12 +32,14 @@ from typing import (
|
|
| 31 |
Dict,
|
| 32 |
List,
|
| 33 |
Optional,
|
|
|
|
| 34 |
)
|
| 35 |
|
| 36 |
import pandas as pd
|
| 37 |
|
| 38 |
from .augmentors import TypeDependentAugmentor
|
| 39 |
from .dict_utils import dict_get
|
|
|
|
| 40 |
from .operators import FieldOperator, InstanceOperator
|
| 41 |
from .random_utils import new_random_generator
|
| 42 |
from .serializers import ImageSerializer, TableSerializer
|
|
@@ -1019,3 +1022,21 @@ class ShuffleColumnsNames(TypeDependentAugmentor):
|
|
| 1019 |
random.shuffle(shuffled_header)
|
| 1020 |
|
| 1021 |
return {"header": shuffled_header, "rows": table["rows"]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
{"key1": "value1", "key2": value2, "key3": "value3"}
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
import ast
|
| 27 |
import json
|
| 28 |
import random
|
| 29 |
from abc import ABC, abstractmethod
|
|
|
|
| 32 |
Dict,
|
| 33 |
List,
|
| 34 |
Optional,
|
| 35 |
+
Tuple,
|
| 36 |
)
|
| 37 |
|
| 38 |
import pandas as pd
|
| 39 |
|
| 40 |
from .augmentors import TypeDependentAugmentor
|
| 41 |
from .dict_utils import dict_get
|
| 42 |
+
from .error_utils import UnitxtWarning
|
| 43 |
from .operators import FieldOperator, InstanceOperator
|
| 44 |
from .random_utils import new_random_generator
|
| 45 |
from .serializers import ImageSerializer, TableSerializer
|
|
|
|
| 1022 |
random.shuffle(shuffled_header)
|
| 1023 |
|
| 1024 |
return {"header": shuffled_header, "rows": table["rows"]}
|
| 1025 |
+
|
| 1026 |
+
|
| 1027 |
+
class JsonStrToListOfKeyValuePairs(FieldOperator):
|
| 1028 |
+
def process_value(self, text: str) -> List[Tuple[str, str]]:
|
| 1029 |
+
text = text.replace("null", "None")
|
| 1030 |
+
|
| 1031 |
+
try:
|
| 1032 |
+
dict_value = ast.literal_eval(text)
|
| 1033 |
+
except Exception as e:
|
| 1034 |
+
UnitxtWarning(
|
| 1035 |
+
f"Unable to convert input text to json format in JsonStrToListOfKeyValuePairs due to {e}. Text: {text}"
|
| 1036 |
+
)
|
| 1037 |
+
dict_value = {}
|
| 1038 |
+
return [
|
| 1039 |
+
(str(key), str(value))
|
| 1040 |
+
for key, value in dict_value.items()
|
| 1041 |
+
if value is not None
|
| 1042 |
+
]
|
templates.py
CHANGED
|
@@ -533,7 +533,8 @@ class MultipleChoiceTemplate(InputFormatTemplate):
|
|
| 533 |
input and reference dictionaries.
|
| 534 |
target_field (str): The key under which the correct choice is stored in the
|
| 535 |
reference dictionary (can be integer index or textual label).
|
| 536 |
-
choices_separator (str): A string used to join formatted
|
|
|
|
| 537 |
source_choice_format (str): A Python format string used for displaying each choice
|
| 538 |
in the input fields (e.g. "{choice_numeral}. {choice_text}").
|
| 539 |
target_choice_format (str): A Python format string used for displaying each choice
|
|
@@ -544,8 +545,10 @@ class MultipleChoiceTemplate(InputFormatTemplate):
|
|
| 544 |
set with `shuffle_choices_seed`.
|
| 545 |
shuffle_choices_seed (int, optional): If provided, the choices are shuffled with
|
| 546 |
this fixed integer seed for reproducibility.
|
| 547 |
-
sort_choices_by_length (bool): If True, sorts choices
|
| 548 |
-
|
|
|
|
|
|
|
| 549 |
reverse_choices (bool): If True, reverses the order of the choices after any
|
| 550 |
sorting has been applied. Defaults to False to preserve backward compatibility.
|
| 551 |
"""
|
|
|
|
| 533 |
input and reference dictionaries.
|
| 534 |
target_field (str): The key under which the correct choice is stored in the
|
| 535 |
reference dictionary (can be integer index or textual label).
|
| 536 |
+
choices_separator (str): A string used to join formatted
|
| 537 |
+
choices (e.g. ", ").
|
| 538 |
source_choice_format (str): A Python format string used for displaying each choice
|
| 539 |
in the input fields (e.g. "{choice_numeral}. {choice_text}").
|
| 540 |
target_choice_format (str): A Python format string used for displaying each choice
|
|
|
|
| 545 |
set with `shuffle_choices_seed`.
|
| 546 |
shuffle_choices_seed (int, optional): If provided, the choices are shuffled with
|
| 547 |
this fixed integer seed for reproducibility.
|
| 548 |
+
sort_choices_by_length (bool): If True, sorts choices
|
| 549 |
+
by their length (ascending).
|
| 550 |
+
sort_choices_alphabetically (bool): If True, sorts choices
|
| 551 |
+
in alphabetical order.
|
| 552 |
reverse_choices (bool): If True, reverses the order of the choices after any
|
| 553 |
sorting has been applied. Defaults to False to preserve backward compatibility.
|
| 554 |
"""
|
text_utils.py
CHANGED
|
@@ -232,7 +232,7 @@ def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
|
|
| 232 |
|
| 233 |
# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
|
| 234 |
d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
|
| 235 |
-
if "\\n" in d1:
|
| 236 |
d1 = f'"{d1}"'
|
| 237 |
return [d1]
|
| 238 |
|
|
|
|
| 232 |
|
| 233 |
# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
|
| 234 |
d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
|
| 235 |
+
if "\\n" in d1 or d1 == "":
|
| 236 |
d1 = f'"{d1}"'
|
| 237 |
return [d1]
|
| 238 |
|
version.py
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
version = "1.
|
|
|
|
| 1 |
+
version = "1.18.0"
|