Spaces:
Runtime error
Runtime error
| # coding=utf-8 | |
| # Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """The WikiTableQuestions dataset is for the task of question answering on semi-structured HTML tables""" | |
| import json | |
| import os | |
| import datasets | |
| from utils.wtq.utils import _load_table_w_page as _load_table | |
| # Find for instance the citation on arxiv or on the dataset repo/website | |
| _CITATION = """\ | |
| @inproceedings{pasupat-liang-2015-compositional, | |
| title = "Compositional Semantic Parsing on Semi-Structured Tables", | |
| author = "Pasupat, Panupong and | |
| Liang, Percy", | |
| booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)", | |
| month = jul, | |
| year = "2015", | |
| address = "Beijing, China", | |
| publisher = "Association for Computational Linguistics", | |
| url = "https://aclanthology.org/P15-1142", | |
| doi = "10.3115/v1/P15-1142", | |
| pages = "1470--1480", | |
| } | |
| """ | |
| _DESCRIPTION = """\ | |
| Two important aspects of semantic parsing for question answering are the breadth of the knowledge source and the depth of | |
| logical compositionality. While existing work trades off one aspect for another, this paper simultaneously makes progress | |
| on both fronts through a new task: answering complex questions on semi-structured tables using question-answer pairs as | |
| supervision. The central challenge arises from two compounding factors: the broader domain results in an open-ended set | |
| of relations, and the deeper compositionality results in a combinatorial explosion in the space of logical forms. We | |
| propose a logical-form driven parsing algorithm guided by strong typing constraints and show that it obtains significant | |
| improvements over natural baselines. For evaluation, we created a new dataset of 22,033 complex questions on Wikipedia | |
| tables, which is made publicly available. | |
| """ | |
| _HOMEPAGE = "https://ppasupat.github.io/WikiTableQuestions/" | |
| _LICENSE = "CC-BY-SA-4.0 License" | |
| _URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip" | |
| _SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip" | |
| class WikiTableQuestion(datasets.GeneratorBasedBuilder): | |
| """The WikiTableQuestions dataset""" | |
| def _info(self): | |
| return datasets.DatasetInfo( | |
| description=_DESCRIPTION, | |
| features=datasets.Features( | |
| { | |
| "id": datasets.Value("string"), | |
| "question": datasets.Value("string"), | |
| "table_id": datasets.Value("string"), | |
| "table": {"page_title": datasets.Value("string"), | |
| "header": datasets.features.Sequence(datasets.Value("string")), | |
| "rows": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string")))}, | |
| "answer_text": datasets.features.Sequence(datasets.Value("string")), | |
| } | |
| ), | |
| supervised_keys=None, | |
| homepage=_HOMEPAGE, | |
| license=_LICENSE, | |
| citation=_CITATION, | |
| ) | |
| def _split_generators(self, dl_manager): | |
| """Returns SplitGenerators.""" | |
| data_dir = os.path.join(dl_manager.download_and_extract(_URL), 'WikiTableQuestions-master') | |
| squall_dir = os.path.join(dl_manager.download_and_extract(_SQUALL_URL), 'squall-main') | |
| return [ | |
| datasets.SplitGenerator( | |
| name=datasets.Split.TRAIN, | |
| gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-train.tsv"), | |
| "data_dir": data_dir, | |
| "squall_path": os.path.join(squall_dir, "data/squall.json")}, | |
| ), | |
| datasets.SplitGenerator( | |
| name=datasets.Split.VALIDATION, | |
| gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-dev.tsv"), | |
| "data_dir": data_dir, | |
| "squall_path": os.path.join(squall_dir, "data/squall.json")}, | |
| ), | |
| datasets.SplitGenerator( | |
| name=datasets.Split.TEST, | |
| gen_kwargs={"filepath": os.path.join(data_dir, "data/pristine-unseen-tables.tsv"), | |
| "data_dir": data_dir, | |
| "squall_path": os.path.join(squall_dir, "data/squall.json")}, | |
| ), | |
| ] | |
| def _generate_examples(self, filepath, data_dir, squall_path): | |
| """Yields examples.""" | |
| squall_id_list = [] | |
| with open(squall_path) as f: | |
| squall_data = json.load(f) | |
| for squall_item in squall_data: | |
| squall_id_list.append(squall_item["nt"]) | |
| # data_id, question, table_id, gold_result_str | |
| with open(filepath, encoding="utf-8") as f: | |
| for idx, line in enumerate(f): | |
| # skip the header | |
| if idx == 0: | |
| continue | |
| data_id, question, table_id, gold_result_str = line.strip("\n").split("\t") | |
| if data_id not in squall_id_list: | |
| gold_result = gold_result_str.split('|') | |
| yield idx, { | |
| "id": data_id, | |
| "question": question, | |
| "table_id": table_id, | |
| "table": _load_table(os.path.join(data_dir, table_id.replace('.csv', '.tsv'))), | |
| # convert the .csv postfix to .tsv, for easier read-in | |
| "answer_text": gold_result, | |
| } | |
| else: | |
| continue | |