"""Scisummnet dataset."""

import os

import datasets


_CITATION = """
@InProceedings{yasunaga&al.19.scisumm,
  title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks},
  author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
  booktitle = {Proceedings of AAAI 2019},
  year = {2019}
}
@InProceedings{yasunaga&al.17,
  title = {Graph-based Neural Multi-Document Summarization},
  author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
  booktitle = {Proceedings of CoNLL 2017},
  year = {2017}
}
"""

_DESCRIPTION = """
A summary of scientific papers should ideally incorporate the impact of the papers on the research community
reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
"""

_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"

_LICENSE = "CC BY-SA 4.0"

_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"


class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
    """Scisummnet dataset."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "document_xml": datasets.Value("string"),
                "citing_sentences_annotated.json": datasets.Value("string"),
                "summary": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs
        path = dl_manager.download_and_extract(my_urls)
        trainpath = os.path.join(
            path, "scisummnet_release1.1__20190413", "top1000_complete"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"extraction_path": trainpath, "split": "train"},
            )
        ]

    def _generate_examples(self, extraction_path, split):
        """Yields examples."""
        for folder in os.listdir(extraction_path):
            entry = {}
            entry["entry_number"] = folder
            doc_xml_path = os.path.join(
                extraction_path, folder, "Documents_xml", folder + ".xml"
            )
            with open(doc_xml_path, "r", encoding="utf-8") as f:
                entry["document_xml"] = f.read()
            cite_annot_path = os.path.join(
                extraction_path, folder, "citing_sentences_annotated.json"
            )
            with open(cite_annot_path, "r", encoding="utf-8") as f:
                entry["citing_sentences_annotated.json"] = f.read()
            summary_path = os.path.join(
                extraction_path, folder, "summary", folder + ".gold.txt"
            )
            with open(summary_path, "r", encoding="utf-8") as f:
                entry["summary"] = f.read()
            yield entry["entry_number"], entry