#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path

import fire
from tqdm import tqdm


def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir=None) -> None:
    """Download a dataset using the datasets package and save it to the format expected by finetune.py.

    Format of save_dir: train.source, train.target, val.source, val.target, test.source, test.target.

    Args:
        src_lang: <str> source language
        tgt_lang: <str> target language
        dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list, run
            `import datasets; print([d for d in datasets.list_datasets() if "wmt" in d])`
        save_dir: <str> where to save the datasets, defaults to f'{dataset}-{src_lang}-{tgt_lang}'

    Usage:
        >>> download_wmt_dataset('ro', 'en', dataset='wmt16')  # saves to wmt16-ro-en
    """
    try:
        import datasets
    except (ModuleNotFoundError, ImportError):
        raise ImportError("run pip install datasets")

    pair = f"{src_lang}-{tgt_lang}"
    print(f"Converting {dataset}-{pair}")
    # load_dataset returns a DatasetDict keyed by split name (e.g. train/validation/test)
    ds = datasets.load_dataset(dataset, pair)
    if save_dir is None:
        save_dir = f"{dataset}-{pair}"
    save_dir = Path(save_dir)
    save_dir.mkdir(exist_ok=True)

    for split in ds.keys():
        print(f"Splitting {split} with {ds[split].num_rows} records")
        # to save to val.source, val.target like summary datasets
        fn = "val" if split == "validation" else split
        src_path = save_dir.joinpath(f"{fn}.source")
        tgt_path = save_dir.joinpath(f"{fn}.target")
        # open via context managers so both files are flushed and closed; "w" suffices since we only write
        with src_path.open("w", encoding="utf-8") as src_fp, tgt_path.open("w", encoding="utf-8") as tgt_fp:
            # reader is the bottleneck so writing one record at a time doesn't slow things down
            for x in tqdm(ds[split]):
                # each record is {"translation": {<lang_code>: <text>, ...}}
                ex = x["translation"]
                src_fp.write(ex[src_lang] + "\n")
                tgt_fp.write(ex[tgt_lang] + "\n")
| print(f"Saved {dataset} dataset to {save_dir}") | |


if __name__ == "__main__":
    fire.Fire(download_wmt_dataset)
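
# Example command-line invocation, as a sketch: `fire.Fire` exposes the function's
# keyword arguments as CLI flags. The filename download_wmt.py is an assumption;
# substitute whatever name this file is saved under.
#
#   python download_wmt.py --src_lang ro --tgt_lang en --dataset wmt16
#
# With the defaults above, this writes train/val/test .source and .target files
# into a wmt16-ro-en/ directory, matching the layout described in the docstring.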