Spaces:
Running
Running
| # This script imports random texts from a folder and converts them for the scorer. | |
| import os | |
| import nltk | |
| import re | |
FOLDER = "../data/текст/"
OUT_FILE = "../data/texts.txt"

# Non-greedy so that each "(...)" group is removed on its own; a greedy
# ".*" would also delete the text between two separate parentheticals
# that happen to share a line.
paranthesis_regex = re.compile(r'\(.*?\)')

# Lower-case Ukrainian letters plus hyphen and the typographic apostrophe.
# A frozenset gives O(1) membership tests in the per-character check.
allowed_chars = frozenset("абвгґдеєжзиіїйклмнопрстуфхцчшщьюя-’")


def read_text(file_path):
    """Return the contents of *file_path* as text.

    The file is decoded as UTF-8 first; if that fails it is re-read as
    cp1251. Any other error (missing file, permissions, bytes invalid in
    both encodings) propagates to the caller.
    """
    try:
        with open(file_path, encoding="utf-8") as input_file:
            return input_file.read()
    except UnicodeDecodeError:
        with open(file_path, encoding="cp1251") as input_file:
            return input_file.read()


def clean_word(word):
    """Return *word* without leading dashes, or None if it must be dropped.

    A word is dropped when nothing is left after stripping the dashes or
    when it contains any character outside ``allowed_chars`` (digits,
    punctuation, Latin letters, line breaks, ...).
    """
    word = word.lstrip("-")
    if not word:
        # Bare dashes and the empty tokens produced by consecutive spaces
        # must not reach the output, or they show up as doubled spaces.
        return None
    if all(letter in allowed_chars for letter in word):
        return word
    return None


def clean_text(raw_text, tokenize):
    """Convert raw text into newline-separated, space-joined sentences.

    Args:
        raw_text: the text of one input file.
        tokenize: callable that splits one sentence string into a list of
            word strings.

    Returns:
        The cleaned sentences joined with "\\n"; an empty string when no
        sentence survives the filtering.
    """
    text = raw_text.lower().replace("'", "’")
    text = paranthesis_regex.sub('', text).strip()

    out_text = []
    # NOTE(review): sentences are split on "." only, and the tokenizer
    # passed in by main() splits on single spaces, so a token that spans a
    # line break still contains "\n" and is rejected by clean_word().
    # Confirm whether that loss is intended.
    for sentence in text.split("."):
        words = []
        for token in tokenize(sentence.strip()):
            word = clean_word(token)
            if word is not None:
                words.append(word)
        # Skip sentences that are empty or consist only of one-letter
        # words (all() over an empty list is True, covering both cases).
        if all(len(word) <= 1 for word in words):
            continue
        out_text.append(" ".join(words))
    return "\n".join(out_text)


def main():
    """Walk FOLDER and append the cleaned text of every file to OUT_FILE."""
    # NOTE(review): kept from the original script; the space tokenizer
    # used below does not appear to need the "punkt" data - confirm
    # before removing.
    nltk.download("punkt")
    tokenizer = nltk.SpaceTokenizer()

    # Explicit UTF-8 so the Cyrillic output does not depend on the locale.
    with open(OUT_FILE, mode="a", encoding="utf-8") as text_file:
        for subdir, _dirs, files in os.walk(FOLDER):
            for file in files:
                file_path = os.path.join(subdir, file)
                print(file_path)
                cleaned_text = clean_text(read_text(file_path),
                                          tokenizer.tokenize)
                if cleaned_text:
                    text_file.write(cleaned_text + "\n")


if __name__ == "__main__":
    main()