Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| ''' | |
| @Author : Jiangjie Chen | |
| @Time : 2020/7/20 17:34 | |
| @Contact : jjchen19@fudan.edu.cn | |
| @Description: | |
| ''' | |
| import tensorflow as tf | |
| import cjjpy as cjj | |
| import os | |
| import re | |
| import ujson as json | |
| from collections import defaultdict | |
| pj_prefix = cjj.AbsParentDir(__file__, '..') | |
| class FEVERLoader: | |
| def __init__(self, role): | |
| role = 'dev' if role == 'val' else role | |
| assert role in ['train', 'dev', 'test', 'eval'] | |
| self.role = role | |
| self.fever_data = defaultdict(dict) | |
| self.SUPPORTS = 'SUPPORTS' | |
| self.REFUTES = 'REFUTES' | |
| self.NEI = 'NOT ENOUGH INFO' | |
| def __iter__(self): | |
| for k in self.fever_data: | |
| yield k | |
| def __len__(self): | |
| return len(self.fever_data) | |
| def __getitem__(self, item): | |
| return self.fever_data[item] | |
| def load_fever(self, retrieve_type='bert', clean_load=True): | |
| self._load_fever_golden() | |
| self._load_fever_all() | |
| self._load_fever_retrieved(retrieve_type, clean_load) | |
| def _load_json(self, fname): | |
| with tf.io.gfile.GFile(fname) as f: | |
| return [json.loads(x) for x in f.readlines()] | |
| def _new_role(self): | |
| role = self.role if self.role != 'eval' else 'dev' | |
| return role | |
| def _load_fever_golden(self): | |
| if self.role == 'test': | |
| postfix = f'data/fever/shared_task_test.jsonl' | |
| for js in self._load_json(f'{pj_prefix}/{postfix}'): | |
| self.fever_data[js['id']].update({ | |
| 'id': js['id'], | |
| 'claim': js['claim'] | |
| }) | |
| else: | |
| role = self._new_role() | |
| postfix = f'data/fever/baked_data/golden_{role}.json' | |
| for js in self._load_json(f'{pj_prefix}/{postfix}'): | |
| self.fever_data[js['id']].update({ | |
| 'id': js['id'], | |
| 'claim': js['claim'], | |
| 'label': js['label'], | |
| 'golden_evidence': self._clean_evidence(js['evidence']) | |
| }) | |
| print('* FEVER golden loaded.') | |
| def _load_fever_all(self): | |
| role = self._new_role() | |
| postfix = f'data/fever/baked_data/all_{role}.json' | |
| for js in self._load_json(f'{pj_prefix}/{postfix}'): | |
| self.fever_data[js['id']].update({ | |
| 'all_evidence': self._clean_evidence(js['evidence']) | |
| }) | |
| print('* FEVER all loaded.') | |
| def _load_fever_retrieved(self, retrieve_type, clean_load): | |
| assert retrieve_type in ['bert'] | |
| postfix = f'data/fever/baked_data/{retrieve_type}_{self.role}.json' | |
| for js in self._load_json(f'{pj_prefix}/{postfix}'): | |
| self.fever_data[js['id']].update({ | |
| f'{retrieve_type}_evidence': self._clean_evidence(js['evidence']) if clean_load else js['evidence'] | |
| }) | |
| print(f'* FEVER {retrieve_type} loaded.') | |
| def clean_text(self, sentence): | |
| sentence = re.sub(" \-LSB\-.*?\-RSB\-", "", sentence) | |
| sentence = re.sub("\-LRB\- \-RRB\- ", "", sentence) | |
| sentence = re.sub(" -LRB-", " ( ", sentence) | |
| sentence = re.sub("-RRB-", " )", sentence) | |
| sentence = re.sub(" LSB.*?RSB", "", sentence) | |
| sentence = re.sub("LRB RRB ", "", sentence) | |
| sentence = re.sub("LRB", " ( ", sentence) | |
| sentence = re.sub("RRB", " )", sentence) | |
| sentence = re.sub("--", "-", sentence) | |
| sentence = re.sub("``", '"', sentence) | |
| sentence = re.sub("''", '"', sentence) | |
| sentence = re.sub(' ', ' ', sentence) | |
| return sentence | |
| def clean_title(self, title): | |
| title = re.sub("_", " ", title) | |
| title = re.sub(" -LRB-", " ( ", title) | |
| title = re.sub("-RRB-", " )", title) | |
| title = re.sub("-COLON-", ":", title) | |
| title = re.sub(' ', ' ', title) | |
| return title | |
| def _clean_evidence(self, evidence): | |
| cev = [] | |
| for ev in evidence: | |
| if len(ev) == 4: | |
| cev.append([self.clean_title(ev[0]), ev[1], self.clean_text(ev[2]), ev[3]]) | |
| elif len(ev) == 3: | |
| cev.append([self.clean_title(ev[0]), ev[1], self.clean_text(ev[2])]) | |
| elif len(ev) == 0: | |
| cev.append(ev) | |
| else: | |
| raise ValueError(ev) | |
| return cev | |
| if __name__ == '__main__': | |
| floader = FEVERLoader('test') | |
| floader.load_fever('bert', clean_load=False) | |
| for k in floader: | |
| print(floader[k]) | |
| input() | |