"""Arxiv dataset."""

import json
import os

import datasets

# BibTeX entry for the paper that introduced the dataset.
_CITATION = """
@article{Cohan_2018,
title={A Discourse-Aware Attention Model for Abstractive Summarization of
Long Documents},
url={http://dx.doi.org/10.18653/v1/n18-2097},
DOI={10.18653/v1/n18-2097},
journal={Proceedings of the 2018 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language
Technologies, Volume 2 (Short Papers)},
publisher={Association for Computational Linguistics},
author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Kim, Seokhwan and Chang, Walter and Goharian, Nazli},
year={2018}
}
"""

_DESCRIPTION = """
A summarization dataset comprised of pairs of scientific papers.
The dataset provides a challenging testbed for abstractive summarization.
It contains papers and their abstracts.
"""

_HOMEPAGE = "https://github.com/armancohan/long-summarization"

_LICENSE = "Apache-2.0 License"

# Zip archive containing arxiv-dataset/{train,val,test}.txt (JSON lines).
_URL = "https://archive.org/download/armancohan-long-summarization-paper-code/arxiv-dataset.zip"
class SummertimeArxiv(datasets.GeneratorBasedBuilder):
    """Arxiv long summarization dataset."""

    VERSION = datasets.Version("1.0.0")

    # A single default (unnamed) configuration.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the DatasetInfo describing one example.

        Each example carries an article id plus the article body and the
        abstract, both stored as lists of sentence strings.
        """
        features = datasets.Features(
            {
                "article_id": datasets.Value("string"),
                "article_text": [datasets.Value("string")],
                "abstract_text": [datasets.Value("string")],
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract the archive and return train/val/test SplitGenerators."""
        path = dl_manager.download_and_extract(_URL)
        # The zip extracts into an "arxiv-dataset" directory holding the
        # three JSON-lines split files.
        path = os.path.join(path, "arxiv-dataset")
        splits = [
            (datasets.Split.TRAIN, "train.txt", "train"),
            (datasets.Split.VALIDATION, "val.txt", "val"),
            (datasets.Split.TEST, "test.txt", "test"),
        ]
        return [
            datasets.SplitGenerator(
                name=name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(path, filename), "split": split},
            )
            for name, filename, split in splits
        ]

    def _generate_examples(self, filepath, split):
        """Yield (article_id, example) pairs from one JSON-lines split file.

        Args:
            filepath: path to the split's .txt file, one JSON object per line.
            split: split tag ("train"/"val"/"test"); unused but part of the
                gen_kwargs contract.
        """
        # Explicit encoding: the split files are UTF-8 JSON lines, so don't
        # depend on the platform's default locale encoding.
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                instance = json.loads(line)
                yield instance["article_id"], {
                    "article_id": instance["article_id"],
                    "article_text": instance["article_text"],
                    "abstract_text": instance["abstract_text"],
                }