Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Dec 3, 2023

Commit

eee0bf8

1 Parent(s): 43b496d

Upload standard.py with huggingface_hub

Browse files

Files changed (1) hide show

standard.py +81 -17

standard.py CHANGED Viewed

@@ -1,11 +1,12 @@
 from typing import List
 from .card import TaskCard
 from .dataclass import InternalField, OptionalField
 from .formats import ICLFormat
 from .instructions import Instruction
-from .operator import SourceSequntialOperator, StreamingOperator
-from .operators import StreamRefiner
 from .recipe import Recipe
 from .renderers import StandardRenderer
 from .schema import ToUnitxtGroup
@@ -13,21 +14,30 @@ from .splitters import Sampler, SeparateSplit, SpreadSplit
 from .templates import Template
-class BaseRecipe(Recipe, SourceSequntialOperator):
     card: TaskCard
     template: Template = None
     instruction: Instruction = None
     format: ICLFormat = ICLFormat()
     max_train_instances: int = None
     max_validation_instances: int = None
     max_test_instances: int = None
-    train_refiner: StreamRefiner = OptionalField(default_factory=lambda: StreamRefiner(apply_to_streams=["train"]))
-    validation_refiner: StreamRefiner = OptionalField(
-        default_factory=lambda: StreamRefiner(apply_to_streams=["validation"])
-    )
-    test_refiner: StreamRefiner = OptionalField(default_factory=lambda: StreamRefiner(apply_to_streams=["test"]))
     demos_pool_size: int = None
     num_demos: int = 0
@@ -37,6 +47,8 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
     demos_field: str = "demos"
     sampler: Sampler = None
     steps: List[StreamingOperator] = InternalField(default_factory=list)
     def verify(self):
@@ -48,7 +60,31 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
                 )
             if self.demos_pool_size < self.num_demos:
                 raise ValueError(
-                    f"demos_pool_size must be bigger than num_demos={self.num_demos}, Got demos_pool_size={self.demos_pool_size}"
                 )
     def prepare(self):
@@ -56,14 +92,23 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
             self.card.loader,
         ]
         if self.card.preprocess_steps is not None:
             self.steps.extend(self.card.preprocess_steps)
         self.steps.append(self.card.task)
         if self.demos_pool_size is not None:
             self.steps.append(
-                SeparateSplit(
                     from_split=self.demos_taken_from,
                     to_split_names=[self.demos_pool_name, self.demos_taken_from],
                     to_split_sizes=[int(self.demos_pool_size)],
@@ -79,7 +124,7 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
             sampler.set_size(self.num_demos)
             self.steps.append(
-                SpreadSplit(
                     source_stream=self.demos_pool_name,
                     target_field=self.demos_field,
                     sampler=sampler,
@@ -87,12 +132,15 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
             )
         self.train_refiner.max_instances = self.max_train_instances
         self.steps.append(self.train_refiner)
         self.validation_refiner.max_instances = self.max_validation_instances
         self.steps.append(self.validation_refiner)
         self.test_refiner.max_instances = self.max_test_instances
         self.steps.append(self.test_refiner)
         render = StandardRenderer(
@@ -104,6 +152,9 @@ class BaseRecipe(Recipe, SourceSequntialOperator):
         self.steps.append(render)
         postprocessors = render.get_postprocessors()
         self.steps.append(
@@ -122,10 +173,21 @@ class StandardRecipeWithIndexes(BaseRecipe):
     def prepare(self):
         assert (
             self.template_card_index is None or self.template is None
-        ), "Specify either template or template_card_index"
         if self.template_card_index is not None:
-            self.template = self.card.templates[int(self.template_card_index)]
         assert (
             self.instruction_card_index is None or self.instruction is None
         ), "Specify either instruction or instruction_card_index"
@@ -136,9 +198,9 @@ class StandardRecipeWithIndexes(BaseRecipe):
 class StandardRecipe(StandardRecipeWithIndexes):
-    """
-    This class represents a standard recipe for data processing and preperation.
-    This class can be used to prepare a recipe
     with all necessary steps, refiners and renderers included. It allows to set various
     parameters and steps in a sequential manner for preparing the recipe.
@@ -146,6 +208,7 @@ class StandardRecipe(StandardRecipeWithIndexes):
         card (TaskCard): TaskCard object associated with the recipe.
         template (Template, optional): Template object to be used for the recipe.
         instruction (Instruction, optional): Instruction object to be used for the recipe.
         format (ICLFormat, optional): ICLFormat object to be used for the recipe.
         train_refiner (StreamRefiner, optional): Train refiner to be used in the recipe.
         max_train_instances (int, optional): Maximum training instances for the refiner.
@@ -160,6 +223,7 @@ class StandardRecipe(StandardRecipeWithIndexes):
         demos_field (str, optional): Field name for demos. Default is "demos".
         sampler (Sampler, optional): Sampler object to be used in the recipe.
         steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe.
         instruction_card_index (int, optional): Index of instruction card to be used
             for preparing the recipe.
         template_card_index (int, optional): Index of template card to be used for

+import logging
 from typing import List
 from .card import TaskCard
 from .dataclass import InternalField, OptionalField
 from .formats import ICLFormat
 from .instructions import Instruction
+from .operator import SourceSequentialOperator, StreamingOperator
+from .operators import Augmentor, NullAugmentor, StreamRefiner
 from .recipe import Recipe
 from .renderers import StandardRenderer
 from .schema import ToUnitxtGroup
 from .templates import Template
+# Used to give meaningful names to recipe steps
+class CreateDemosPool(SeparateSplit):
+    pass
+class AddDemosField(SpreadSplit):
+    pass
+class BaseRecipe(Recipe, SourceSequentialOperator):
     card: TaskCard
     template: Template = None
     instruction: Instruction = None
     format: ICLFormat = ICLFormat()
+    loader_limit: int = None
     max_train_instances: int = None
     max_validation_instances: int = None
     max_test_instances: int = None
+    train_refiner: StreamRefiner = OptionalField(default_factory=StreamRefiner)
+    validation_refiner: StreamRefiner = OptionalField(default_factory=StreamRefiner)
+    test_refiner: StreamRefiner = OptionalField(default_factory=StreamRefiner)
     demos_pool_size: int = None
     num_demos: int = 0
     demos_field: str = "demos"
     sampler: Sampler = None
+    augmentor: Augmentor = OptionalField(default_factory=NullAugmentor)
     steps: List[StreamingOperator] = InternalField(default_factory=list)
     def verify(self):
                 )
             if self.demos_pool_size < self.num_demos:
                 raise ValueError(
+                    f"demos_pool_size must be bigger than num_demos ({self.num_demos}), Got demos_pool_size={self.demos_pool_size}"
+                )
+            if self.loader_limit and self.demos_pool_size > self.loader_limit:
+                raise ValueError(
+                    f"demos_pool_size must be bigger than loader_limit ({self.loader_limit}), Got demos_pool_size={self.demos_pool_size}"
+                )
+        if self.loader_limit:
+            if self.max_test_instances and self.max_test_instances > self.loader_limit:
+                raise ValueError(
+                    f"max_test_instances must be bigger than loader_limit ({self.loader_limit}), Got max_test_instances={self.max_test_instances}"
+                )
+            if (
+                self.max_validation_instances
+                and self.max_validation_instances > self.loader_limit
+            ):
+                raise ValueError(
+                    f"max_validation_instances must be bigger than loader_limit ({self.loader_limit}), Got max_validation_instances={self.max_validation_instances}"
+                )
+            if (
+                self.max_train_instances
+                and self.max_train_instances > self.loader_limit
+            ):
+                raise ValueError(
+                    f"max_train_instances must be bigger than loader_limit ({self.loader_limit}), Got max_train_instances={self.max_train_instances}"
                 )
     def prepare(self):
             self.card.loader,
         ]
+        if self.loader_limit:
+            self.card.loader.loader_limit = self.loader_limit
+            logging.info(f"Loader line limit was set to  {self.loader_limit}")
+            self.steps.append(StreamRefiner(max_instances=self.loader_limit))
         if self.card.preprocess_steps is not None:
             self.steps.extend(self.card.preprocess_steps)
         self.steps.append(self.card.task)
+        if self.augmentor.augment_task_input:
+            self.augmentor.set_task_input_fields(self.card.task.augmentable_inputs)
+            self.steps.append(self.augmentor)
         if self.demos_pool_size is not None:
             self.steps.append(
+                CreateDemosPool(
                     from_split=self.demos_taken_from,
                     to_split_names=[self.demos_pool_name, self.demos_taken_from],
                     to_split_sizes=[int(self.demos_pool_size)],
             sampler.set_size(self.num_demos)
             self.steps.append(
+                AddDemosField(
                     source_stream=self.demos_pool_name,
                     target_field=self.demos_field,
                     sampler=sampler,
             )
         self.train_refiner.max_instances = self.max_train_instances
+        self.train_refiner.apply_to_streams = ["train"]
         self.steps.append(self.train_refiner)
         self.validation_refiner.max_instances = self.max_validation_instances
+        self.validation_refiner.apply_to_streams = ["validation"]
         self.steps.append(self.validation_refiner)
         self.test_refiner.max_instances = self.max_test_instances
+        self.test_refiner.apply_to_streams = ["test"]
         self.steps.append(self.test_refiner)
         render = StandardRenderer(
         self.steps.append(render)
+        if self.augmentor.augment_model_input:
+            self.steps.append(self.augmentor)
         postprocessors = render.get_postprocessors()
         self.steps.append(
     def prepare(self):
         assert (
             self.template_card_index is None or self.template is None
+        ), f"Specify either template ({self.template}) or template_card_index ({self.template_card_index}) but not both"
+        assert not (
+            self.template_card_index is None and self.template is None
+        ), "Specify either template or template_card_index in card"
         if self.template_card_index is not None:
+            try:
+                self.template = self.card.templates[self.template_card_index]
+            except Exception as e:
+                if isinstance(self.card.templates, dict):
+                    options = self.card.templates.keys()
+                else:
+                    options = list(range(0, len(self.card.templates)))
+                raise ValueError(
+                    f"card_template_index '{self.template_card_index}' is not in card. Available options: {options}"
+                ) from e
         assert (
             self.instruction_card_index is None or self.instruction is None
         ), "Specify either instruction or instruction_card_index"
 class StandardRecipe(StandardRecipeWithIndexes):
+    """This class represents a standard recipe for data processing and preparation.
+    This class can be used to prepare a recipe
     with all necessary steps, refiners and renderers included. It allows to set various
     parameters and steps in a sequential manner for preparing the recipe.
         card (TaskCard): TaskCard object associated with the recipe.
         template (Template, optional): Template object to be used for the recipe.
         instruction (Instruction, optional): Instruction object to be used for the recipe.
+        loader_limit (int, optional): Specifies the maximum number of instances per stream to be returned from the loader (used to reduce loading time in large datasets)
         format (ICLFormat, optional): ICLFormat object to be used for the recipe.
         train_refiner (StreamRefiner, optional): Train refiner to be used in the recipe.
         max_train_instances (int, optional): Maximum training instances for the refiner.
         demos_field (str, optional): Field name for demos. Default is "demos".
         sampler (Sampler, optional): Sampler object to be used in the recipe.
         steps (List[StreamingOperator], optional): List of StreamingOperator objects to be used in the recipe.
+        augmentor (Augmentor, optional): Augmentor to be used to pseudo-randomly augment the source text.
         instruction_card_index (int, optional): Index of instruction card to be used
             for preparing the recipe.
         template_card_index (int, optional): Index of template card to be used for