|
|
import gradio as gr |
|
|
from transformers import VitsModel, AutoTokenizer |
|
|
import torch |
|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
import io |
|
|
import os |
|
|
import string |
|
|
import unicodedata |
|
|
from pypinyin import pinyin, Style |
|
|
import re |
|
|
from umsc import UgMultiScriptConverter |
|
|
|
|
|
|
|
|
# Script converters from the `umsc` package. Judging by the variable names,
# 'UAS' denotes the Uyghur Arabic script and 'ULS' the Uyghur Latin script
# -- TODO confirm against the umsc documentation.
ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
# Applied to user input in text_to_speech() before number expansion.
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
|
|
|
|
|
from huggingface_hub import login |
|
|
|
|
|
# Log in to the Hugging Face Hub only when a non-empty token is configured.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
|
|
|
|
|
|
|
|
def number_to_uyghur_arabic_script(number_str):
    """
    Convert a numeric string to its Uyghur pronunciation in Arabic script.

    Supported forms (integer part and decimal part up to 9 digits each):
      * integers               e.g. '123', '1,000'
      * decimals               e.g. '0.001' (the decimal part is read as a
                               whole number after a fractional term)
      * single-digit fractions e.g. '1/4'
      * percentages            e.g. '25%'
      * ordinals               e.g. '1968_' or '1968-' (trailing marker)

    Args:
        number_str (str): Number as a string. Commas and spaces are ignored.

    Returns:
        str: Uyghur pronunciation in Arabic script. Ordinal results always end
        with a single space, which stands in for the marker that was removed,
        so the following word does not fuse with the ordinal. Inputs outside
        the supported range (more than 9 digits, multi-digit fractions,
        negative ordinals) are returned as the cleaned-up input string.

    Raises:
        ValueError: If a numeric component cannot be parsed by ``int()``.
    """
    digits = {
        0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
        6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز'
    }
    ordinals = {
        1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
        6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى'
    }
    tens = {
        10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
        60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان'
    }
    units = [
        (1000000000, 'مىليارد'),
        (1000000, 'مىليون'),
        (1000, 'مىڭ'),
        (100, 'يۈز')
    ]
    # Fractional term keyed by the number of significant decimal places.
    fractions = {
        1: 'ئوندا',
        2: 'يۈزدە',
        3: 'مىڭدە',
        4: 'ئون مىڭدە',
        5: 'يۈز مىڭدە',
        6: 'مىليوندا',
        7: 'ئون مىليوندا',
        8: 'يۈز مىليوندا',
        9: 'مىليارددا'
    }
    ordinal_suffix = 'ىنجى'
    max_value = 999999999

    def integer_to_words(num):
        """Spell out a non-negative integer."""
        num = int(num)
        if num == 0:
            return digits[0]

        result = []
        for value, unit_name in units:
            if num >= value:
                count = num // value
                # A count of one is left implicit: 100 is 'يۈز', not 'بىر يۈز'.
                if count == 1:
                    result.append(unit_name)
                else:
                    result.append(integer_to_words(count) + ' ' + unit_name)
                num %= value

        # Whatever remains after the loop is below 100.
        if num >= 10:
            ten, one = divmod(num, 10)
            if one == 0:
                result.append(tens[ten * 10])
            else:
                result.append(tens[ten * 10] + ' ' + digits[one])
        elif num > 0:
            result.append(digits[num])

        return ' '.join(result)

    def to_ordinal(word):
        """Attach the ordinal suffix to a round-number word."""
        # A final 'ە' is dropped before the suffix, mirroring the single-digit
        # table above ('ئالتە' -> 'ئالتىنجى'); otherwise 20 would come out
        # with the vowel and the suffix stacked on top of each other.
        if word.endswith('ە'):
            word = word[:-1]
        return word + ordinal_suffix

    number_str = number_str.replace(',', '').replace(' ', '')

    # Ordinals: a trailing '_' or '-' marks the number as ordinal.
    if number_str.endswith(('_', '-')):
        number_str = number_str[:-1]
        num = int(number_str)
        if num < 0 or num > max_value:
            return number_str
        if num in ordinals:
            return ordinals[num] + ' '

        words = integer_to_words(num).split()
        last_digit = num % 10
        if last_digit in ordinals:
            # Only the final digit takes the ordinal form: 16 -> 'ئون ئالتىنجى'.
            words[-1] = ordinals[last_digit]
        else:
            # Round numbers (0, 10, 20, ..., 100, 1000, ...) suffix their last word.
            words[-1] = to_ordinal(words[-1])
        return ' '.join(words) + ' '

    is_percentage = number_str.endswith('%')
    if is_percentage:
        number_str = number_str[:-1]

    # Fractions: only single-digit numerator and denominator are spelled out.
    if '/' in number_str:
        numerator, denominator = map(int, number_str.split('/'))
        if numerator in digits and denominator in digits:
            return f"{digits[denominator]}دە {digits[numerator]}"
        else:
            return number_str

    parts = number_str.split('.')
    integer_part = parts[0]
    decimal_part = parts[1] if len(parts) > 1 else None

    if len(integer_part) > 9:
        return number_str
    if decimal_part and len(decimal_part) > 9:
        return number_str

    pronunciation = integer_to_words(int(integer_part))

    if decimal_part:
        pronunciation += ' پۈتۈن'
        # Trailing zeros carry no value; a decimal part made only of zeros
        # ('1.0', '1.00') contributes nothing after the separator word.
        significant = decimal_part.rstrip('0')
        if significant:
            fraction_term = fractions.get(len(significant), 'مىليارددا')
            pronunciation += ' ' + fraction_term + ' ' + integer_to_words(int(significant))

    if is_percentage:
        pronunciation += ' پىرسەنت'

    return pronunciation.strip()
|
|
|
|
|
|
|
|
|
|
|
def process_uyghur_text_with_numbers(text):
    """
    Replace the numbers embedded in a text with their Uyghur pronunciation.

    Every '%' is first replaced by the spelled-out percent word. The text is
    then scanned for maximal runs of digits and the symbols '/', '.', '_' and
    '-'. A run is converted when it is an integer, a decimal ('3.14'), a
    fraction ('1/4') or an ordinal (digits followed by '_' or '-'). Trailing
    full stops are treated as sentence punctuation, so '2020.' is converted
    and the full stop kept. Runs that are not valid numbers, and all other
    characters, are copied through unchanged.

    Args:
        text (str): Input string with Uyghur text and numbers (e.g., '1/4 كىلو 25% تەملىك').

    Returns:
        str: String with numbers converted to Uyghur pronunciation, non-numeric text preserved.
    """
    text = text.replace('%', ' پىرسەنت ')

    number_chars = '0123456789/._-'
    # fraction | decimal | integer with optional ordinal marker
    valid_number = re.compile(r'[0-9]+/[0-9]+|[0-9]+\.[0-9]+|[0-9]+[_-]?')

    pieces = []
    pos = 0
    length = len(text)
    while pos < length:
        char = text[pos]
        if char not in number_chars:
            pieces.append(char)
            pos += 1
            continue

        # Collect the maximal run of number-like characters.
        end = pos
        while end < length and text[end] in number_chars:
            end += 1
        run = text[pos:end]
        pos = end

        core, tail = run, ''
        if not valid_number.fullmatch(core):
            # A number at the end of a sentence drags the full stop(s) into
            # the run; retry without them before giving up on the run.
            core = run.rstrip('.')
            tail = run[len(core):]
            if not valid_number.fullmatch(core):
                pieces.append(run)
                continue

        try:
            spoken = number_to_uyghur_arabic_script(core)
        except ValueError:
            pieces.append(run)
            continue

        # The ordinal marker doubled as the separator from the next word
        # ('7-ئاينىڭ'); make sure removing it does not fuse the two words.
        if core[-1] in '_-' and not spoken.endswith(' '):
            spoken += ' '
        pieces.append(spoken + tail)

    return ''.join(pieces)
|
|
|
|
|
def fix_pauctuations(batch):
    """
    Reduce a text to the character set used for synthesis.

    The text is lower-cased, NFKC-normalised, and four look-alike letters are
    mapped to the letters present in the vocabulary. Every character that is
    not in the vocabulary afterwards (punctuation, digits, Latin letters, ...)
    is replaced by a single space.

    Args:
        batch (str): Text to clean.

    Returns:
        str: Cleaned text with the same number of characters as the
        normalised input.
    """
    look_alikes = str.maketrans({'ژ': 'ج', 'ک': 'ك', 'ی': 'ى', 'ه': 'ە'})
    allowed = frozenset(
        [" ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف", "ق", "ك", "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ", "ۇ", "ۈ", "ۋ", "ې", "ە"]
    )

    normalized = unicodedata.normalize('NFKC', batch.lower()).translate(look_alikes)
    return ''.join(char if char in allowed else ' ' for char in normalized)
|
|
|
|
|
def chinese_to_pinyin(mixed_text):
    """
    Romanise the Chinese characters of a mixed-language string.

    Each run of characters in the range U+4E00..U+9FFF is replaced by its
    Pinyin syllables (Style.NORMAL, i.e. no tone marks) separated by single
    spaces. Everything outside those runs is left untouched.

    Args:
        mixed_text (str): Input string containing Chinese characters and other languages (e.g., English, Uyghur)

    Returns:
        str: String with Chinese characters converted to Pinyin (no tone marks), non-Chinese text unchanged
    """
    def _romanize(run):
        # pinyin() yields one list of candidate readings per character;
        # the first candidate is used.
        readings = pinyin(run.group(0), style=Style.NORMAL)
        return ' '.join(candidates[0] for candidates in readings)

    return re.sub(r'[\u4e00-\u9fff]+', _romanize, mixed_text)
|
|
|
|
|
|
|
|
|
|
|
# Maps the label offered in the model dropdown to the Hugging Face repository
# id that load_model_and_tokenizer() passes to `from_pretrained`.
MODEL_OPTIONS = {
    "Uyghur (Arabic script, CV_Unique-2)": "piyazon/TTS-CV-Unique-Ug-2",
    "Uyghur (Arabic script, Roman-Girl_Ug)": "piyazon/TTS-Roman-Girl-Ug",
    "Uyghur (Arabic script, QutadguBilik)": "piyazon/qutadgu_bilik",
    "Uyghur (Arabic script, MMS-TTS)": "facebook/mms-tts-uig-script_arabic",
}
|
|
|
|
|
|
|
|
# Caches keyed by the MODEL_OPTIONS label, so each checkpoint is loaded at
# most once for the lifetime of the process.
model_cache = {}
tokenizer_cache = {}


def load_model_and_tokenizer(model_name):
    """
    Return the VITS model and tokenizer for a MODEL_OPTIONS label, loading
    and caching them on first use.

    Args:
        model_name (str): A key of MODEL_OPTIONS.

    Returns:
        tuple: ``(model, tokenizer)`` for the selected checkpoint.

    Raises:
        KeyError: If ``model_name`` is not a key of MODEL_OPTIONS.
    """
    if model_name not in model_cache or model_name not in tokenizer_cache:
        repo_id = MODEL_OPTIONS[model_name]
        model = VitsModel.from_pretrained(repo_id)
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        # Publish both entries only once both loads succeeded; otherwise a
        # failed tokenizer download would leave a cached model without a
        # tokenizer and every later call would fail with KeyError.
        tokenizer_cache[model_name] = tokenizer
        model_cache[model_name] = model
    return model_cache[model_name], tokenizer_cache[model_name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def text_to_speech(text, model_name):
    """
    Synthesise speech for ``text`` with the selected model.

    The text is prepared in four steps: Chinese characters are romanised,
    the result is passed through the Latin-to-Arabic script converter,
    numbers are spelled out, and finally everything outside the synthesis
    character set is blanked. The prepared text is printed for inspection.

    Args:
        text (str): Text entered by the user.
        model_name (str): A key of MODEL_OPTIONS.

    Returns:
        bytes: The generated audio encoded as a WAV file.
    """
    model, tokenizer = load_model_and_tokenizer(model_name)

    fixted_text = fix_pauctuations(
        process_uyghur_text_with_numbers(ug_latn_to_arab(chinese_to_pinyin(text)))
    )
    print(fixted_text)

    inputs = tokenizer(fixted_text, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**inputs).waveform

    audio_data = output.squeeze().numpy()
    sample_rate = model.config.sampling_rate

    # Encode in memory. A fixed "output.wav" in the working directory would be
    # shared by concurrent requests (one request could read or delete the
    # other's file) and would be left behind if an error occurred before the
    # cleanup step.
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, sample_rate, format="WAV")
    return buffer.getvalue()
|
|
|
|
|
|
|
|
# Sample inputs offered in the UI. Each entry is [text, model label]; the
# label must be a key of MODEL_OPTIONS.
examples = [
    ["ئامېرىكا ئارمىيەسى 1945-يىلى 7-ئاينىڭ 16-كۈنى دۇنيا بويىچە تۇنجى قېتىم« ئۈچنى بىر گەۋدىلەشتۈرۈش» يادرو سىنىقىنى ئېلىپ باردى", "Uyghur (Arabic script, CV_Unique-2)"],
    ["بىز ئىنسانلارنىڭ ھەممىسى بىرلىكتە ياشايمىز. ھەر بىر ئادەم ئۆزىنىڭ يولىنى تاللىيالايدۇ.", "Uyghur (Arabic script, QutadguBilik)"],
    ["بۇ بىر گۈزەل كۈن، ھەممەيلەن بىرلىكتە خۇشال بولايلى. 5 كىشى بىللە ئويۇن ئوينايدۇ.", "Uyghur (Arabic script, MMS-TTS)"],
]
|
|
|
|
|
|
|
|
# Widgets and long strings are named up front so the Interface call stays short.
text_input = gr.Textbox(
    label="Enter text to convert to speech",
    elem_classes="rtl-text",
    elem_id="input-textbox",
    lines=6,
    max_lines=15,
)
model_selector = gr.Dropdown(
    choices=list(MODEL_OPTIONS),
    label="Select TTS Model",
    value="Uyghur (Arabic script, CV_Unique-2)",
)
speech_output = gr.Audio(label="Generated Speech", type="filepath")

page_description = """
    Uyghur Text To Speech<br>
    <strong style="color:red;">Warning:</strong> This Gradio app is just a demo of Uyghur TTS. For privacy purposes, these voices should not be used for business or personal projects. Anyone wanting to use Uyghur TTS should clone their own voice or obtain authorization from the voice owner to train their own TTS model. For fine-tuning instructions, visit <a href='https://github.com/ylacombe/finetune-hf-vits' target='_blank'>this GitHub repository</a>.
    """

# Right-to-left input box and an Arabic-capable font for the textbox and the
# examples table.
page_css = """
    @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
    .rtl-text textarea {
        direction: rtl;
        width: 100%;
        height: 200px;
        font-size: 17px;
        font-family: "Noto Sans Arabic" !important;
    }
    .table-wrap{
        font-family: "Noto Sans Arabic" !important;
    }
    """

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[text_input, model_selector],
    outputs=speech_output,
    title="Text-to-Speech with MMS-TTS Models",
    description=page_description,
    examples=examples,
    css=page_css,
)

demo.launch()
|
|
|
|
|
|