Spaces:
Runtime error
Runtime error
# Round-trip demo for the Hugging Face `datasets` library: load one dataset,
# then write it out and read it back as Arrow, CSV, JSON Lines and Parquet.
#
# NOTE(review): the bare expression statements below (for example
# `arrow_datasets_reloaded` on a line of its own) are kept exactly as they
# were. They do nothing under plain `python`, but notebook / Streamlit-style
# runtimes display such values -- confirm the target runtime before turning
# them into print() calls.
from datasets import load_dataset, load_from_disk

# Split names used to build the `data_files` mappings for every reload below.
SPLIT_NAMES = ("train", "validation", "test")

# --- Arrow: save the freshly loaded dataset and read it back ---------------
raw_datasets = load_dataset("allocine")
raw_datasets.save_to_disk("my-arrow-datasets")

arrow_datasets_reloaded = load_from_disk("my-arrow-datasets")
arrow_datasets_reloaded

# Disabled earlier experiment, kept for reference:
#dataset_dict.save_to_disk("../data/wikipedia_rank_nocache")

# NOTE(review): second Arrow copy, written outside the working directory.
# The "=" in the directory name looks unintended -- confirm the path.
raw_datasets.save_to_disk("../data/awacke1=allocine")

# Show which cache files back the reloaded Arrow dataset.
arrow_datasets_reloaded.cache_files

# --- CSV: one file per split, then reload through the "csv" loader ---------
for split_name, split_ds in raw_datasets.items():
    split_ds.to_csv(f"my-dataset-{split_name}.csv", index=None)

data_files = {name: f"my-dataset-{name}.csv" for name in SPLIT_NAMES}
csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
csv_datasets_reloaded

# --- JSON Lines: one file per split, then reload through "json" ------------
for split_name, split_ds in raw_datasets.items():
    split_ds.to_json(f"my-dataset-{split_name}.jsonl")

json_data_files = {name: f"my-dataset-{name}.jsonl" for name in SPLIT_NAMES}
json_datasets_reloaded = load_dataset("json", data_files=json_data_files)
json_datasets_reloaded

# --- Parquet: one file per split, then reload through "parquet" ------------
for split_name, split_ds in raw_datasets.items():
    split_ds.to_parquet(f"my-dataset-{split_name}.parquet")

parquet_data_files = {name: f"my-dataset-{name}.parquet" for name in SPLIT_NAMES}
parquet_datasets_reloaded = load_dataset("parquet", data_files=parquet_data_files)
parquet_datasets_reloaded

# --- Arrow again, this time under a namespaced local directory -------------
# Saves to and reloads from the local path "awacke1/my-arrow-datasets";
# `arrow_datasets_reloaded` is rebound to this second copy.
raw_datasets.save_to_disk("awacke1/my-arrow-datasets")
arrow_datasets_reloaded = load_from_disk("awacke1/my-arrow-datasets")
thisworked = "Yes really worked"
arrow_datasets_reloaded
thisworked

# Disabled: loading the same name through `load_dataset` instead of
# `load_from_disk`.
# NOTE(review): presumably this needs the dataset to be published first --
# verify before re-enabling.
#awacke1_public_datasets = load_dataset("awacke1/my-arrow-datasets")
#awacke1_public_datasets