Spaces:
Running
Running
#!/usr/bin/env python
"""
Convert CLC-FCE dataset (The Cambridge Learner Corpus) to the parallel sentences format.
"""
import argparse
import glob
import os
import re

# cElementTree was removed in Python 3.9; fall back to the plain ElementTree,
# which has used the accelerated C implementation automatically since 3.3.
try:
    from xml.etree import cElementTree
except ImportError:
    from xml.etree import ElementTree as cElementTree

from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
def annotate_fce_doc(xml):
    """Convert one FCE XML document to annotated text.

    Args:
        xml: Raw XML string of a single FCE document.

    Returns:
        A single string with one annotated paragraph per line; errors are
        marked inline as ``{mistake=>correction}`` by ``_get_formatted_text``.
        (The previous docstring claimed this *yields sentences*; it actually
        returns the joined paragraph texts.)
    """
    doc = cElementTree.fromstring(xml)
    paragraphs = doc.findall('head/text/*/coded_answer/p')
    return '\n'.join(_get_formatted_text(p) for p in paragraphs)
| def _get_formatted_text(elem, ignore_tags=None): | |
| text = elem.text or '' | |
| ignore_tags = [tag.upper() for tag in (ignore_tags or [])] | |
| correct = None | |
| mistake = None | |
| for child in elem.getchildren(): | |
| tag = child.tag.upper() | |
| if tag == 'NS': | |
| text += _get_formatted_text(child) | |
| elif tag == 'UNKNOWN': | |
| text += ' UNKNOWN ' | |
| elif tag == 'C': | |
| assert correct is None | |
| correct = _get_formatted_text(child) | |
| elif tag == 'I': | |
| assert mistake is None | |
| mistake = _get_formatted_text(child) | |
| elif tag in ignore_tags: | |
| pass | |
| else: | |
| raise ValueError(f"Unknown tag `{child.tag}`", text) | |
| if correct or mistake: | |
| correct = correct or '' | |
| mistake = mistake or '' | |
| if '=>' not in mistake: | |
| text += f'{{{mistake}=>{correct}}}' | |
| else: | |
| text += mistake | |
| text += elem.tail or '' | |
| return text | |
def convert_fce(fce_dir):
    """Process the whole FCE directory and return annotated documents (strings).

    Args:
        fce_dir: Root folder of the unpacked FCE release (must contain a
            ``dataset`` subfolder with ``*/*.xml`` documents).

    Raises:
        UserWarning: If *fce_dir* is missing or lacks a ``dataset`` subfolder.
    """
    # Validate the dataset layout before touching any files.
    if not os.path.isdir(fce_dir):
        raise UserWarning(
            f"{fce_dir} is not a valid path")
    dataset_dir = os.path.join(fce_dir, 'dataset')
    if not os.path.exists(dataset_dir):
        raise UserWarning(
            f"{fce_dir} doesn't point to a dataset's root dir")

    # Convert every XML doc to the corpora format, in a stable order.
    annotated_docs = []
    for xml_path in sorted(glob.glob(os.path.join(dataset_dir, '*/*.xml'))):
        with open(xml_path, encoding='utf-8') as xml_file:
            annotated_docs.append(annotate_fce_doc(xml_file.read()))
    return annotated_docs
def main():
    """Convert the FCE dataset into parallel original/corrected text files.

    Reads CLI options from the module-level ``args`` namespace and writes
    ``fce-original.txt`` / ``fce-applied.txt`` into ``args.output``.
    """
    fce = convert_fce(args.fce_dataset_path)
    # One annotation span: {mistake=>correction}. Hoisted out of the loop.
    alert_re = re.compile(r'{([^{}]*?)=>([^{}]*?)}')
    # A sentence ending right after an alert ("...}." or "...>.") was cut in
    # the middle of an annotation by sent_tokenize and must be merged forward.
    dangling_re = re.compile(r"[{>][.?!]$")
    with open(os.path.join(args.output, "fce-original.txt"), 'w', encoding='utf-8') as out_original, \
            open(os.path.join(args.output, "fce-applied.txt"), 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            for chunk in re.split(r"\n +\n", doc):
                tokenized_sents = sent_tokenize(chunk)
                for i in range(len(tokenized_sents)):
                    # Bounds guard: the original code indexed i + 1
                    # unconditionally and raised IndexError whenever the LAST
                    # sentence matched the dangling pattern.
                    if i + 1 < len(tokenized_sents) and dangling_re.search(tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    original = alert_re.sub(r"\1", tokenized_sents[i])
                    applied = alert_re.sub(r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
def build_arg_parser():
    """Build the CLI parser for the converter script."""
    parser = argparse.ArgumentParser(description=(
        "Convert CLC-FCE dataset to the parallel sentences format."))
    parser.add_argument('fce_dataset_path',
                        help='Path to the folder with the FCE dataset')
    # Previously --output had no default, so omitting it made main() crash
    # with an opaque TypeError; default to the current directory instead.
    parser.add_argument('--output',
                        default='.',
                        help='Path to the output folder (default: current directory)')
    return parser


if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    main()