# Kokoro-TTS / idn_phonemes.py
# jonathanjordan21's picture
# Update idn_phonemes.py
# 70d2a73 verified
import re
# --- IPA map ---
# Grapheme -> IPA table. Insertion order is kept as-is because the lookup
# code sorts keys by length with a stable sort.
ipa_map = {
    # digraphs
    "ng": "ŋ",
    "ny": "ɲ",
    "sy": "ʃ",
    "kh": "x",
    # affricates
    "c": "tʃ",
    "j": "dʒ",
    # stops
    "b": "b",
    "d": "d̪",
    "t": "t̪",
    "g": "ɡ",
    "k": "k",
    "p": "p",
    # remaining consonants
    "m": "m",
    "n": "n",
    "l": "l",
    "s": "s",
    "h": "h",
    "r": "r",
    "w": "w",
    "y": "j",
    # vowels ("e" is always rendered as schwa)
    "a": "a",
    "i": "i",
    "u": "u",
    "o": "o",
    "e": "ə",
}
# Spoken names of the letters (used when spelling out abbreviations)
letter_words = {
    "a": "a", "b": "be", "c": "ce", "d": "de", "e": "e",
    "f": "ef", "g": "ge", "h": "ha", "i": "i", "j": "je",
    "k": "ka", "l": "el", "m": "em", "n": "en", "o": "o",
    "p": "pe", "q": "ki", "r": "er", "s": "es", "t": "te",
    "u": "u", "v": "fe", "w": "we", "x": "eks", "y": "ye",
    "z": "zet",
}
# Spoken name of each decimal digit, keyed by the digit character.
digit_words = {
    str(value): word
    for value, word in enumerate((
        "nol", "satu", "dua", "tiga", "empat",
        "lima", "enam", "tujuh", "delapan", "sembilan",
    ))
}
# --- Number to words (up to trillions) ---
def number_to_words(n: int) -> str:
    """Spell out an integer in Indonesian words.

    Args:
        n: The value to spell out. It is passed through ``int()``, so
            numeric strings such as ``"42"`` are accepted as well.

    Returns:
        The Indonesian reading, e.g. ``"seribu dua ratus"`` for 1200.
        Negative values are prefixed with ``"minus"``. "triliun" is the
        largest scale word; bigger values are read as a multiple of it
        (the multiplier is spelled out recursively).

    Raises:
        ValueError, TypeError: If ``int(n)`` cannot convert the argument.
    """
    n = int(n)
    if n == 0:
        return "nol"
    if n < 0:
        # Without this guard a negative value would index the word table
        # from the end, yielding a wrong word or an IndexError.
        return "minus " + number_to_words(-n)

    units = (
        "nol", "satu", "dua", "tiga", "empat", "lima", "enam",
        "tujuh", "delapan", "sembilan", "sepuluh", "sebelas",
    )

    def _below_thousand(x):
        """Spell out an integer in the range 0..999."""
        if x < 12:
            return units[x]
        if x < 20:
            return _below_thousand(x - 10) + " belas"
        if x < 100:
            q, r = divmod(x, 10)
            return _below_thousand(q) + " puluh" + ((" " + _below_thousand(r)) if r else "")
        if x < 200:
            # "one hundred" uses the fused form "seratus", not "satu ratus".
            return "seratus" + ((" " + _below_thousand(x - 100)) if x > 100 else "")
        q, r = divmod(x, 100)
        return _below_thousand(q) + " ratus" + ((" " + _below_thousand(r)) if r else "")

    scales = [
        (1_000_000_000_000, "triliun"),
        (1_000_000_000, "miliar"),
        (1_000_000, "juta"),
        (1000, "ribu"),
    ]
    parts = []
    remaining = n
    for v, nm in scales:
        if remaining >= v:
            q, remaining = divmod(remaining, v)
            if v == 1000 and q == 1:
                # "one thousand" uses the fused form "seribu".
                parts.append("seribu")
            else:
                parts.append(number_to_words(q) + " " + nm)
    if remaining:
        parts.append(_below_thousand(remaining))
    return " ".join(parts)
# --- Phone numbers ---
# An optional "+62" prefix (or one leading digit) followed by at least seven
# more digits, not touching any other word character on either side.
phone_pattern = re.compile(r'(?<!\w)(?:\+62|\d)\d{7,}(?!\w)')


def expand_phones(text: str) -> str:
    """Read phone-number-like digit runs digit by digit.

    Every match of ``phone_pattern`` is replaced by the space-separated
    ``digit_words`` entries of its digits; a leading "+" is dropped.
    Text outside the matches is returned unchanged.
    """
    def _spell_digits(match):
        spoken = [digit_words[digit] for digit in re.findall(r'\d', match.group(0))]
        return " ".join(spoken)

    return phone_pattern.sub(_spell_digits, text)
# --- General numbers ---
def expand_numbers(text: str) -> str:
    """Replace every run of digits in ``text`` with its spelled-out form.

    Each run is converted with ``int()`` and handed to ``number_to_words``;
    everything else in the text is left as it is.
    """
    return re.sub(r'\d+', lambda found: number_to_words(int(found.group())), text)
# --- Abbreviations ---
# Two or more consecutive ASCII capitals that stand alone as a token.
abbr_pattern = re.compile(r'(?<!\w)([A-Z]{2,})(?!\w)')


def expand_abbreviations(text: str) -> str:
    """Spell out all-caps abbreviations letter by letter.

    Every match of ``abbr_pattern`` is replaced by the space-separated
    ``letter_words`` names of its letters. Other text is left unchanged.
    """
    def _spell_letters(match):
        letters = match.group(1)
        if letters != "HP":
            return " ".join(letter_words[letter.lower()] for letter in letters)
        # A stand-alone "HP" gets a fixed, hard-coded reading.
        return "ha pe"

    return abbr_pattern.sub(_spell_letters, text)
# --- IPA ---
def apply_ipa_map(text: str) -> str:
    """Transliterate ``text`` to IPA using ``ipa_map``.

    The text is lower-cased, every ``ipa_map`` key found in it is replaced
    by its IPA value, and whitespace runs are collapsed to single spaces
    with leading/trailing whitespace removed. Characters that are not keys
    of ``ipa_map`` pass through unchanged.

    All substitutions happen in a single left-to-right pass. Replacing one
    key at a time would re-process earlier output: the "t" produced by
    "c" -> "tʃ" would be rewritten again by the "t" rule, and likewise the
    "d" produced by the "j" rule.

    Args:
        text: Input text, already expanded to plain words.

    Returns:
        The IPA string with normalized whitespace.
    """
    lowered = text.lower()
    if ipa_map:  # an empty alternation would match the empty string everywhere
        # Longest keys first: regex alternation takes the first alternative
        # that matches, so digraphs such as "ng" must precede "n".
        keys = sorted(ipa_map, key=len, reverse=True)
        alternation = "|".join(re.escape(key) for key in keys)
        lowered = re.sub(alternation, lambda m: ipa_map[m.group(0)], lowered)
    return re.sub(r'\s+', ' ', lowered).strip()
# --- Pipeline ---
def indo_to_ipa(text: str) -> str:
    """Run the full text-to-IPA pipeline on ``text``.

    Stages, applied in order, each receiving the previous stage's output:
      1. phone numbers  (``expand_phones``)
      2. plain numbers  (``expand_numbers``)
      3. abbreviations  (``expand_abbreviations``)
      4. IPA mapping    (``apply_ipa_map``)
    """
    stages = (expand_phones, expand_numbers, expand_abbreviations, apply_ipa_map)
    result = text
    for stage in stages:
        result = stage(result)
    return result
# import re
# ipa_map = {
# "ng": "ŋ",
# "ny": "ɲ",
# "sy": "ʃ",
# "kh": "x",
# "c": "tʃ",
# "j": "dʒ",
# "y": "j",
# "r": "r",
# "x": "ks",
# "a": "a",
# "i": "i",
# "u": "u",
# "e": "ə",
# "o": "o",
# "b": "b",
# "d": "d̪",
# "t": "t̪",
# "g": "ɡ",
# "k": "k",
# "p": "p",
# "m": "m",
# "n": "n",
# "l": "l",
# "s": "s",
# "h": "h",
# "w": "w",
# }
# num_words = {
# 0: "nol",
# 1: "satu",
# 2: "dua",
# 3: "tiga",
# 4: "empat",
# 5: "lima",
# 6: "enam",
# 7: "tujuh",
# 8: "delapan",
# 9: "sembilan",
# 10: "sepuluh",
# 11: "sebelas"
# }
# def number_to_words(n: int) -> str:
# """Konversi angka 0–9999 ke kata dalam bahasa Indonesia"""
# if n < 12:
# return num_words[n]
# elif n < 20:
# return number_to_words(n-10) + " belas"
# elif n < 100:
# puluhan, sisa = divmod(n, 10)
# result = number_to_words(puluhan) + " puluh"
# if sisa:
# result += " " + number_to_words(sisa)
# return result
# elif n < 200:
# return "seratus" + (" " + number_to_words(n-100) if n > 100 else "")
# elif n < 1000:
# ratusan, sisa = divmod(n, 100)
# result = number_to_words(ratusan) + " ratus"
# if sisa:
# result += " " + number_to_words(sisa)
# return result
# elif n < 2000:
# return "seribu" + (" " + number_to_words(n-1000) if n > 1000 else "")
# elif n < 10000:
# ribuan, sisa = divmod(n, 1000)
# result = number_to_words(ribuan) + " ribu"
# if sisa:
# result += " " + number_to_words(sisa)
# return result
# else:
# return str(n) # fallback
# def expand_abbreviation(word: str) -> str:
# """Ubah singkatan (huruf kapital) jadi ucapan Indonesia"""
# if word.isupper() and len(word) > 1: # contoh: KTP, DPR, RI
# return " ".join(letter_words.get(ch.lower(), ch) for ch in word)
# return word
# letter_words = {
# "a": "a",
# "b": "be",
# "c": "ce",
# "d": "de",
# "e": "e",
# "f": "ef",
# "g": "ge",
# "h": "ha",
# "i": "i",
# "j": "je",
# "k": "ka",
# "l": "el",
# "m": "em",
# "n": "en",
# "o": "o",
# "p": "pe",
# "q": "ki",
# "r": "er",
# "s": "es",
# "t": "te",
# "u": "u",
# "v": "fe",
# "w": "we",
# "x": "eks",
# "y": "ye",
# "z": "zet",
# }
# def indo_to_ipa(text: str) -> str:
# text = text.lower()
# # Tangani singkatan (huruf kapital semua)
# words = []
# for w in text.split():
# if w.isupper() and len(w) > 1:
# words.append(expand_abbreviation(w))
# else:
# words.append(w)
# text = " ".join(words)
# # Tangani angka → kata
# def replace_number(match):
# num = int(match.group())
# return number_to_words(num)
# text = re.sub(r"\d+", replace_number, text)
# # Konversi huruf → IPA
# for k in sorted(ipa_map.keys(), key=lambda x: -len(x)):
# text = re.sub(k, ipa_map[k], text)
# return text
# # def indo_to_ipa(text: str) -> str:
# # text = text.lower()
# # # Cari semua angka dalam teks dan ubah ke kata
# # def replace_number(match):
# # num = int(match.group())
# # return number_to_words(num)
# # text = re.sub(r"\d+", replace_number, text)
# # # Konversi huruf → IPA
# # for k in sorted(ipa_map.keys(), key=lambda x: -len(x)):
# # text = re.sub(k, ipa_map[k], text)
# # return text