Spaces:
Runtime error
Runtime error
| import json | |
| from tqdm import tqdm | |
| import sys | |
| from viet_text_tools import normalize_diacritics | |
| sys.path.append("..") | |
| from utils.logger import get_logger | |
| import re | |
# Input corpus (one JSON object per line) and the two output files produced
# by this script: the corrected text and the noisy text.
vsec_path = "../data/vsec/VSEC.jsonl"
# The corpus is Vietnamese text, so the encoding is pinned to UTF-8 instead of
# relying on the platform default. These handles are closed at the end of the
# script.
test_file = open("../data/vsec/vsec.test", "w+", encoding="utf-8")
test_noise_file = open("../data/vsec/vsec.test.noise", "w+", encoding="utf-8")
with open(vsec_path, "r", encoding="utf-8") as vsec_file:
    # json.loads ignores surrounding whitespace, so the line is parsed as-is
    # rather than slicing off its last character (which would cut the closing
    # brace of a final line that has no trailing newline). Blank lines are
    # skipped.
    data = [json.loads(line) for line in vsec_file if line.strip()]
def get_true_text(sentence: dict):
    """Return the corrected text of one annotated sentence, tokenized.

    Args:
        sentence: A record with an ``'annotations'`` list. Each annotation
            provides ``'is_correct'`` and ``'current_syllable'``, and for
            incorrect syllables an ``'alternative_syllables'`` list.

    Returns:
        A single string in which word runs and individual punctuation
        characters are separated by one space.
    """
    true_tokens = []
    for word in sentence['annotations']:
        alternatives = word.get('alternative_syllables')
        # Keep the written syllable when it is marked correct, and also when
        # no alternative is available to replace it (avoids an IndexError on
        # an empty list).
        if word['is_correct'] or not alternatives:
            true_tokens.append(word['current_syllable'])
        else:
            # Only the first listed alternative is used as the correction.
            true_tokens.append(alternatives[0])
    true_sentence = " ".join(true_tokens)
    # Raw string so that \w and \s reach the regex engine instead of being
    # treated as (invalid) string escapes. Matches a run of word characters
    # or one single non-word, non-space character.
    words = re.findall(r"\w+|[^\w\s]", true_sentence)
    return " ".join(words)
def get_noise_text(sentence: dict):
    """Return the uncorrected text of one annotated sentence, tokenized.

    Args:
        sentence: A record with an ``'annotations'`` list whose items each
            provide ``'current_syllable'``.

    Returns:
        A single string of space-separated tokens, each passed through
        ``normalize_diacritics``.
    """
    noised_sentence = " ".join(
        word['current_syllable'] for word in sentence['annotations']
    )
    # Raw string so that \w and \s reach the regex engine instead of being
    # treated as (invalid) string escapes. Matches a run of word characters
    # or one single non-word, non-space character.
    words = re.findall(r"\w+|[^\w\s]", noised_sentence)
    # normalize_diacritics comes from viet_text_tools; presumably it
    # canonicalizes Vietnamese tone-mark placement -- confirm against that
    # package.
    return " ".join(normalize_diacritics(word) for word in words)
# Write one line per sentence to each output file, keeping the two files
# aligned line by line. The handles are closed even if processing fails
# part-way through.
try:
    for sentence in tqdm(data):
        # Both texts are computed before either is written so that a failure
        # in one of them cannot leave the files with different line counts.
        true_text = get_true_text(sentence)
        noised_text = get_noise_text(sentence)
        test_file.write(true_text + "\n")
        test_noise_file.write(noised_text + "\n")
finally:
    test_file.close()
    test_noise_file.close()