"""Arxiv dataset."""

import json
import os

import datasets

# BibTeX entry for the paper that introduced the dataset.
_CITATION = """
@article{Cohan_2018,
title={A Discourse-Aware Attention Model for Abstractive Summarization of
Long Documents},
url={http://dx.doi.org/10.18653/v1/n18-2097},
DOI={10.18653/v1/n18-2097},
journal={Proceedings of the 2018 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language
Technologies, Volume 2 (Short Papers)},
publisher={Association for Computational Linguistics},
author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Kim, Seokhwan and Chang, Walter and Goharian, Nazli},
year={2018}
}
"""

_DESCRIPTION = """
A summarization dataset comprised of pairs of scientific papers.
The dataset provides a challenging testbed for abstractive summarization.
It contains papers and their abstracts.
"""

_HOMEPAGE = "https://github.com/armancohan/long-summarization"

_LICENSE = "Apache-2.0 License"

# Zip archive containing arxiv-dataset/{train,val,test}.txt (JSON lines).
_URL = "https://archive.org/download/armancohan-long-summarization-paper-code/arxiv-dataset.zip"
class SummertimeArxiv(datasets.GeneratorBasedBuilder):
    """Arxiv long summarization dataset."""

    VERSION = datasets.Version("1.0.0")

    # A single default (unnamed) configuration.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the DatasetInfo describing one example.

        Each example carries an article id plus the article body and the
        abstract, both stored as lists of sentence strings.
        """
        features = datasets.Features(
            {
                "article_id": datasets.Value("string"),
                "article_text": [datasets.Value("string")],
                "abstract_text": [datasets.Value("string")],
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract the archive and return train/val/test SplitGenerators."""
        path = dl_manager.download_and_extract(_URL)
        # The zip extracts into an "arxiv-dataset" directory holding the
        # three JSON-lines split files.
        path = os.path.join(path, "arxiv-dataset")
        splits = [
            (datasets.Split.TRAIN, "train.txt", "train"),
            (datasets.Split.VALIDATION, "val.txt", "val"),
            (datasets.Split.TEST, "test.txt", "test"),
        ]
        return [
            datasets.SplitGenerator(
                name=name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": os.path.join(path, filename), "split": split},
            )
            for name, filename, split in splits
        ]

    def _generate_examples(self, filepath, split):
        """Yield (article_id, example) pairs from one JSON-lines split file.

        Args:
            filepath: path to the split's .txt file, one JSON object per line.
            split: split tag ("train"/"val"/"test"); unused but part of the
                gen_kwargs contract.
        """
        # Explicit encoding: the split files are UTF-8 JSON lines, so don't
        # depend on the platform's default locale encoding.
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                instance = json.loads(line)
                yield instance["article_id"], {
                    "article_id": instance["article_id"],
                    "article_text": instance["article_text"],
                    "abstract_text": instance["abstract_text"],
                }