Spaces:
Sleeping
Sleeping
| # Copyright (c) 2023 Amphion. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| # This module is modified from [Whisper](https://github.com/openai/whisper.git). | |
| # ## Citations | |
| # ```bibtex | |
| # @inproceedings{openai-whisper, | |
| # author = {Alec Radford and | |
| # Jong Wook Kim and | |
| # Tao Xu and | |
| # Greg Brockman and | |
| # Christine McLeavey and | |
| # Ilya Sutskever}, | |
| # title = {Robust Speech Recognition via Large-Scale Weak Supervision}, | |
| # booktitle = {{ICML}}, | |
| # series = {Proceedings of Machine Learning Research}, | |
| # volume = {202}, | |
| # pages = {28492--28518}, | |
| # publisher = {{PMLR}}, | |
| # year = {2023} | |
| # } | |
| # ``` | |
| # | |
| import re | |
| import unicodedata | |
| import regex | |
# Non-ASCII letters that "NFKD" normalization does not separate into a base
# letter plus a combining mark, each mapped to a plain ASCII replacement.
# Entries are listed as lowercase/uppercase pairs.
# NOTE(review): "Þ" maps to lowercase "th" while every other uppercase key
# maps to an uppercase value; kept as-is to preserve existing behavior.
ADDITIONAL_DIACRITICS = {
    # ligatures and slashed o
    "œ": "oe", "Œ": "OE",
    "ø": "o", "Ø": "O",
    "æ": "ae", "Æ": "AE",
    # sharp s
    "ß": "ss", "ẞ": "SS",
    # d with stroke and eth
    "đ": "d", "Đ": "D",
    "ð": "d", "Ð": "D",
    # thorn
    "þ": "th", "Þ": "th",
    # l with stroke
    "ł": "l", "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and blank out marks, symbols and punctuation.

    The input is first normalized to "NFKD"; each resulting character is
    then handled by the first rule that applies:

    1. characters contained in ``keep`` are passed through untouched
       (the check runs on the normalized characters, not the raw input);
    2. characters listed in ``ADDITIONAL_DIACRITICS`` are replaced by
       their manual mapping;
    3. non-spacing marks (category "Mn") are dropped;
    4. any other mark, symbol or punctuation (major category "M", "S" or
       "P") becomes a single space;
    5. everything else is kept as-is.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKD", s):
        if ch in keep:
            pieces.append(ch)
            continue
        if ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
            continue
        category = unicodedata.category(ch)
        if category == "Mn":
            # combining diacritic split off by NFKD: remove it entirely
            continue
        pieces.append(" " if category[0] in "MSP" else ch)
    return "".join(pieces)
def remove_symbols(s: str):
    """
    Blank out marks, symbols and punctuation while keeping diacritics.

    The input is normalized to "NFKC" and every character whose Unicode
    major category is "M" (mark), "S" (symbol) or "P" (punctuation) is
    replaced by a single space; all other characters are left unchanged.
    """

    def _blank_or_keep(ch):
        # only the first letter of the category (the major class) matters
        major = unicodedata.category(ch)[0]
        return " " if major in "MSP" else ch

    normalized = unicodedata.normalize("NFKC", s)
    return "".join(map(_blank_or_keep, normalized))
class BasicTextNormalizer:
    """
    Callable text normalizer: lowercases, drops bracketed/parenthesized
    spans, blanks out symbols and collapses whitespace.

    Args:
        remove_diacritics: when true, clean with
            ``remove_symbols_and_diacritics``; otherwise with
            ``remove_symbols``.
        split_letters: when true, separate every grapheme cluster of the
            cleaned text with a space.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return the normalized form of ``s``."""
        text = s.lower()
        # remove words between brackets: <...> or [...]
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # remove words between parenthesis
        text = re.sub(r"\(([^)]+?)\)", "", text)
        # lowercase again after cleaning, since normalization may
        # produce new characters
        text = self.clean(text).lower()

        if self.split_letters:
            # one list entry per grapheme cluster, re-joined with spaces
            graphemes = regex.findall(r"\X", text, regex.U)
            text = " ".join(graphemes)

        # replace any successive whitespace characters with a space
        return re.sub(r"\s+", " ", text)