"""Scisummnet dataset."""

import os

import datasets


_CITATION = """
@InProceedings{yasunaga&al.19.scisumm,
  title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks},
  author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
  booktitle = {Proceedings of AAAI 2019},
  year = {2019}
}
@InProceedings{yasunaga&al.17,
  title = {Graph-based Neural Multi-Document Summarization},
  author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
  booktitle = {Proceedings of CoNLL 2017},
  year = {2017}
}
"""

_DESCRIPTION = """
A summary of scientific papers should ideally incorporate the impact of the papers on the research community
reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
"""

_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"

_LICENSE = "CC BY-SA 4.0"

_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"


class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
    """Scisummnet dataset."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "document_xml": datasets.Value("string"),
                "citing_sentences_annotated.json": datasets.Value("string"),
                "summary": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        my_urls = _URLs
        path = dl_manager.download_and_extract(my_urls)
        trainpath = os.path.join(
            path, "scisummnet_release1.1__20190413", "top1000_complete"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"extraction_path": trainpath, "split": "train"},
            )
        ]

    def _generate_examples(self, extraction_path, split):
        """Yields examples."""
        for folder in os.listdir(extraction_path):
            entry = {}
            entry["entry_number"] = folder
            doc_xml_path = os.path.join(
                extraction_path, folder, "Documents_xml", folder + ".xml"
            )
            with open(doc_xml_path, "r", encoding="utf-8") as f:
                entry["document_xml"] = f.read()
            cite_annot_path = os.path.join(
                extraction_path, folder, "citing_sentences_annotated.json"
            )
            with open(cite_annot_path, "r", encoding="utf-8") as f:
                entry["citing_sentences_annotated.json"] = f.read()
            summary_path = os.path.join(
                extraction_path, folder, "summary", folder + ".gold.txt"
            )
            with open(summary_path, "r", encoding="utf-8") as f:
                entry["summary"] = f.read()
            yield entry["entry_number"], entry