Upload loaders.py with huggingface_hub
Browse files- loaders.py +24 -0
loaders.py
CHANGED
|
@@ -34,6 +34,7 @@ from datasets import load_dataset as hf_load_dataset
|
|
| 34 |
from tqdm import tqdm
|
| 35 |
|
| 36 |
from .dataclass import InternalField, OptionalField
|
|
|
|
| 37 |
from .logging_utils import get_logger
|
| 38 |
from .operator import SourceOperator
|
| 39 |
from .settings_utils import get_settings
|
|
@@ -449,3 +450,26 @@ class LoadFromIBMCloud(Loader):
|
|
| 449 |
)
|
| 450 |
|
| 451 |
return MultiStream.from_iterables(dataset)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
from tqdm import tqdm
|
| 35 |
|
| 36 |
from .dataclass import InternalField, OptionalField
|
| 37 |
+
from .fusion import FixedFusion
|
| 38 |
from .logging_utils import get_logger
|
| 39 |
from .operator import SourceOperator
|
| 40 |
from .settings_utils import get_settings
|
|
|
|
| 450 |
)
|
| 451 |
|
| 452 |
return MultiStream.from_iterables(dataset)
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
class MultipleSourceLoader(Loader):
|
| 456 |
+
"""Allow loading data from multiple sources.
|
| 457 |
+
|
| 458 |
+
Examples:
|
| 459 |
+
1) Loading the train split from Huggingface hub and the test set from a local file:
|
| 460 |
+
|
| 461 |
+
MultipleSourceLoader(sources = [ LoadHF(path="public/data",split="train"), LoadCSV({"test": "mytest.csv"}) ])
|
| 462 |
+
|
| 463 |
+
2) Loading a test set combined from two files
|
| 464 |
+
|
| 465 |
+
MultipleSourceLoader(sources = [ LoadCSV({"test": "mytest1.csv"}), LoadCSV({"test": "mytest2.csv"}) ])
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
"""
|
| 469 |
+
|
| 470 |
+
sources: List[Loader]
|
| 471 |
+
|
| 472 |
+
def process(self):
|
| 473 |
+
return FixedFusion(
|
| 474 |
+
origins=self.sources, max_instances_per_origin=self.get_limit()
|
| 475 |
+
).process()
|