Spaces:

tom-doerr
/

logo_generator

Runtime error

boris committed on Jan 3, 2022

Commit

901ff72

1 Parent(s): fdf7698

feat: shard by host is optional

Files changed (2) hide show

dalle_mini/data.py CHANGED Viewed

@@ -27,6 +27,7 @@ class Dataset:
     do_train: bool = False
     do_eval: bool = True
     seed_dataset: int = None
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
@@ -42,7 +43,11 @@ class Dataset:
                 if isinstance(f, str):
                     setattr(self, k, list(braceexpand(f)))
             # for list of files, split training data shards by host
-            if isinstance(self.train_file, list) and self.multi_hosts:
                 self.train_file = self.train_file[
                     jax.process_index() :: jax.process_count()
                 ]
@@ -185,7 +190,7 @@ class Dataset:
             first_loop = True
             while self.multi_hosts or first_loop:
                 # in multi-host, we run forever (no epoch) as hosts need to stop
-                # at same the time and we don't know how much data is on each host
                 if not first_loop:
                     # multi-host setting, we reshuffle shards
                     epoch += 1

     do_train: bool = False
     do_eval: bool = True
     seed_dataset: int = None
+    shard_by_host: bool = False
     train_dataset: Dataset = field(init=False)
     eval_dataset: Dataset = field(init=False)
     rng_dataset: jnp.ndarray = field(init=False)
                 if isinstance(f, str):
                     setattr(self, k, list(braceexpand(f)))
             # for list of files, split training data shards by host
+            if (
+                isinstance(self.train_file, list)
+                and self.multi_hosts
+                and self.shard_by_host
+            ):
                 self.train_file = self.train_file[
                     jax.process_index() :: jax.process_count()
                 ]
             first_loop = True
             while self.multi_hosts or first_loop:
                 # in multi-host, we run forever (no epoch) as hosts need to stop
+                # at the same time and we don't know how much data is on each host
                 if not first_loop:
                     # multi-host setting, we reshuffle shards
                     epoch += 1

tools/train/train.py CHANGED Viewed

@@ -112,16 +112,22 @@ class DataTrainingArguments:
         metadata={"help": "An optional input evaluation data file (glob acceptable)."},
     )
     # data loading should not be a bottleneck so we use "streaming" mode by default
-    streaming: bool = field(
         default=True,
         metadata={"help": "Whether to stream the dataset."},
     )
-    use_auth_token: bool = field(
         default=False,
         metadata={
             "help": "Whether to use the authentication token for private datasets."
         },
     )
     max_train_samples: Optional[int] = field(
         default=None,
         metadata={

         metadata={"help": "An optional input evaluation data file (glob acceptable)."},
     )
     # data loading should not be a bottleneck so we use "streaming" mode by default
+    streaming: Optional[bool] = field(
         default=True,
         metadata={"help": "Whether to stream the dataset."},
     )
+    use_auth_token: Optional[bool] = field(
         default=False,
         metadata={
             "help": "Whether to use the authentication token for private datasets."
         },
     )
+    shard_by_host: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "Whether to shard data files by host in multi-host environments."
+        },
+    )
     max_train_samples: Optional[int] = field(
         default=None,
         metadata={