Spaces:
Paused
Paused
alan
committed on
Commit
·
05d581b
1
Parent(s):
cc744ff
added blane-tts
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ from gradio_client import Client
|
|
| 9 |
import pyloudnorm as pyln
|
| 10 |
import soundfile as sf
|
| 11 |
import librosa
|
| 12 |
-
from detoxify import Detoxify
|
| 13 |
import os
|
| 14 |
import tempfile
|
| 15 |
from pydub import AudioSegment
|
|
@@ -27,7 +27,7 @@ def match_target_amplitude(sound, target_dBFS):
|
|
| 27 |
|
| 28 |
|
| 29 |
|
| 30 |
-
toxicity = Detoxify('original')
|
| 31 |
# with open('harvard_sentences.txt') as f:
|
| 32 |
with open('ja_sentences.txt') as f:
|
| 33 |
sents = f.read().strip().splitlines()
|
|
@@ -50,7 +50,8 @@ AVAILABLE_MODELS = {
|
|
| 50 |
# 'Parler TTS': 'parler'
|
| 51 |
'MOE': 'moe',
|
| 52 |
'BARK': 'bark',
|
| 53 |
-
'KOTOBA-SPEECH': 'kotoba-speech'
|
|
|
|
| 54 |
}
|
| 55 |
|
| 56 |
SPACE_ID = os.getenv('SPACE_ID')
|
|
@@ -63,12 +64,12 @@ DB_NAME = "database.db"
|
|
| 63 |
DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME
|
| 64 |
print(f"Using {DB_PATH}")
|
| 65 |
# AUDIO_DATASET_ID = "ttseval/tts-arena-new"
|
| 66 |
-
CITATION_TEXT = """@misc{tts-arena,
|
| 67 |
-
title = {Text to Speech Arena},
|
| 68 |
-
author = {
|
| 69 |
year = 2024,
|
| 70 |
publisher = {Hugging Face},
|
| 71 |
-
howpublished = "\\url{https://huggingface.co/spaces/
|
| 72 |
}"""
|
| 73 |
|
| 74 |
####################################
|
|
@@ -114,25 +115,36 @@ def create_db_if_missing():
|
|
| 114 |
def get_db():
|
| 115 |
return sqlite3.connect(DB_PATH)
|
| 116 |
|
| 117 |
-
def
|
| 118 |
-
url =
|
|
|
|
|
|
|
|
|
|
| 119 |
headers = {
|
| 120 |
"Content-Type": "application/json"
|
| 121 |
}
|
| 122 |
data = {
|
| 123 |
-
"
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
}
|
| 133 |
|
| 134 |
# Send POST request
|
| 135 |
-
response = requests.post(url, headers=headers, data=json.dumps(data))
|
| 136 |
response.raise_for_status() # Raise an error for bad status codes
|
| 137 |
|
| 138 |
# Print the response to inspect its structure
|
|
@@ -147,7 +159,7 @@ def kotoba_speech_tts(text):
|
|
| 147 |
event_id = response_json['event_id']
|
| 148 |
|
| 149 |
# Send GET request to the next URL
|
| 150 |
-
stream_url = f"
|
| 151 |
stream_response = requests.get(stream_url, stream=True)
|
| 152 |
stream_response.raise_for_status() # Raise an error for bad status codes
|
| 153 |
|
|
@@ -220,6 +232,8 @@ DESCR = """
|
|
| 220 |
# Japanese TTS Arena: Benchmarking Japanese TTS Models in the Wild
|
| 221 |
|
| 222 |
Vote to help the community find the best available text-to-speech model!
|
|
|
|
|
|
|
| 223 |
""".strip()
|
| 224 |
# INSTR = """
|
| 225 |
# ## Instructions
|
|
@@ -344,7 +358,8 @@ model_names = {
|
|
| 344 |
# 'metavoice': 'MetaVoice-1B',
|
| 345 |
'bark': 'BARK',
|
| 346 |
'moe': 'MOE',
|
| 347 |
-
'kotoba-speech': 'KOTOBA-SPEECH'
|
|
|
|
| 348 |
# 'styletts2': 'StyleTTS 2',
|
| 349 |
}
|
| 350 |
model_licenses = {
|
|
@@ -395,7 +410,8 @@ model_links = {
|
|
| 395 |
# 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
|
| 396 |
'bark': 'https://suno-bark.hf.space/',
|
| 397 |
'moe': 'skytnt/moe-tts',
|
| 398 |
-
|
|
|
|
| 399 |
}
|
| 400 |
model_kwargs = {
|
| 401 |
'moe': {
|
|
@@ -404,9 +420,6 @@ model_kwargs = {
|
|
| 404 |
'bark': {
|
| 405 |
'fn_index': 3
|
| 406 |
},
|
| 407 |
-
# 'kotoba-speech': {
|
| 408 |
-
# 'api_name': '/tts'
|
| 409 |
-
# }
|
| 410 |
}
|
| 411 |
# def get_random_split(existing_split=None):
|
| 412 |
# choice = random.choice(list(audio_dataset.keys()))
|
|
@@ -686,9 +699,9 @@ def synthandreturn(text):
|
|
| 686 |
|
| 687 |
result = router.predict(*model_args[model], **model_kwargs[model])
|
| 688 |
else:
|
| 689 |
-
result =
|
| 690 |
# URL to download the file from
|
| 691 |
-
url = f"
|
| 692 |
# Local filename to save the downloaded file
|
| 693 |
local_filename = '/tmp/' + str(mkuuid(None)) + '.wav'
|
| 694 |
|
|
@@ -913,17 +926,17 @@ with gr.Blocks() as vote:
|
|
| 913 |
|
| 914 |
with gr.Blocks() as about:
|
| 915 |
gr.Markdown(ABOUT)
|
| 916 |
-
with gr.Blocks() as admin:
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
with gr.Blocks(theme=theme, css="footer {visibility: hidden}textbox{resize:none}", title="TTS Arena copy") as demo:
|
| 924 |
gr.Markdown(DESCR)
|
| 925 |
-
gr.TabbedInterface([vote, leaderboard, about, admin], ['Vote', 'Leaderboard', 'About', 'Admin (ONLY IN BETA)'])
|
| 926 |
-
|
| 927 |
if CITATION_TEXT:
|
| 928 |
with gr.Row():
|
| 929 |
with gr.Accordion("Citation", open=False):
|
|
|
|
| 9 |
import pyloudnorm as pyln
|
| 10 |
import soundfile as sf
|
| 11 |
import librosa
|
| 12 |
+
# from detoxify import Detoxify
|
| 13 |
import os
|
| 14 |
import tempfile
|
| 15 |
from pydub import AudioSegment
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
|
| 30 |
+
# toxicity = Detoxify('original')
|
| 31 |
# with open('harvard_sentences.txt') as f:
|
| 32 |
with open('ja_sentences.txt') as f:
|
| 33 |
sents = f.read().strip().splitlines()
|
|
|
|
| 50 |
# 'Parler TTS': 'parler'
|
| 51 |
'MOE': 'moe',
|
| 52 |
'BARK': 'bark',
|
| 53 |
+
'KOTOBA-SPEECH': 'kotoba-speech',
|
| 54 |
+
'BLANE-TTS': 'blane-tts'
|
| 55 |
}
|
| 56 |
|
| 57 |
SPACE_ID = os.getenv('SPACE_ID')
|
|
|
|
| 64 |
DB_PATH = f"/data/{DB_NAME}" if os.path.isdir("/data") else DB_NAME
|
| 65 |
print(f"Using {DB_PATH}")
|
| 66 |
# AUDIO_DATASET_ID = "ttseval/tts-arena-new"
|
| 67 |
+
CITATION_TEXT = """@misc{tts-arena-ja,
|
| 68 |
+
title = {Japanese Text to Speech Arena},
|
| 69 |
+
author = {Kotoba Technologies.},
|
| 70 |
year = 2024,
|
| 71 |
publisher = {Hugging Face},
|
| 72 |
+
howpublished = "\\url{https://huggingface.co/spaces/kotoba-speech/TTS-Arena-copy}"
|
| 73 |
}"""
|
| 74 |
|
| 75 |
####################################
|
|
|
|
| 115 |
def get_db():
|
| 116 |
return sqlite3.connect(DB_PATH)
|
| 117 |
|
| 118 |
+
def get_tts_file(text: str, model: str):
|
| 119 |
+
url = {
|
| 120 |
+
"kotoba-speech": "https://kotoba-tech-kotoba-speech.hf.space/call/tts",
|
| 121 |
+
"blane-tts": "https://blane187-blane-tts.hf.space/call/get_audio_file"
|
| 122 |
+
}
|
| 123 |
headers = {
|
| 124 |
"Content-Type": "application/json"
|
| 125 |
}
|
| 126 |
data = {
|
| 127 |
+
"kotoba-speech": {
|
| 128 |
+
"data": [
|
| 129 |
+
text,
|
| 130 |
+
5,
|
| 131 |
+
5,
|
| 132 |
+
"Preset voices",
|
| 133 |
+
"Ava",
|
| 134 |
+
{"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"},
|
| 135 |
+
{"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"}
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
"blane-tts": {
|
| 139 |
+
"data": [
|
| 140 |
+
text,
|
| 141 |
+
"Japanese"
|
| 142 |
+
]
|
| 143 |
+
}
|
| 144 |
}
|
| 145 |
|
| 146 |
# Send POST request
|
| 147 |
+
response = requests.post(url[model], headers=headers, data=json.dumps(data[model]))
|
| 148 |
response.raise_for_status() # Raise an error for bad status codes
|
| 149 |
|
| 150 |
# Print the response to inspect its structure
|
|
|
|
| 159 |
event_id = response_json['event_id']
|
| 160 |
|
| 161 |
# Send GET request to the next URL
|
| 162 |
+
stream_url = f"{url[model]}/{event_id}"
|
| 163 |
stream_response = requests.get(stream_url, stream=True)
|
| 164 |
stream_response.raise_for_status() # Raise an error for bad status codes
|
| 165 |
|
|
|
|
| 232 |
# Japanese TTS Arena: Benchmarking Japanese TTS Models in the Wild
|
| 233 |
|
| 234 |
Vote to help the community find the best available text-to-speech model!
|
| 235 |
+
|
| 236 |
+
_This arena is inspired and built on [TTS Arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena)._
|
| 237 |
""".strip()
|
| 238 |
# INSTR = """
|
| 239 |
# ## Instructions
|
|
|
|
| 358 |
# 'metavoice': 'MetaVoice-1B',
|
| 359 |
'bark': 'BARK',
|
| 360 |
'moe': 'MOE',
|
| 361 |
+
'kotoba-speech': 'KOTOBA-SPEECH',
|
| 362 |
+
'blane-tts': 'BLANE-TTS'
|
| 363 |
# 'styletts2': 'StyleTTS 2',
|
| 364 |
}
|
| 365 |
model_licenses = {
|
|
|
|
| 410 |
# 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
|
| 411 |
'bark': 'https://suno-bark.hf.space/',
|
| 412 |
'moe': 'skytnt/moe-tts',
|
| 413 |
+
'kotoba-speech': 'https://kotoba-tech-kotoba-speech.hf.space/',
|
| 414 |
+
'blane-tts': 'https://blane187-blane-tts.hf.space/'
|
| 415 |
}
|
| 416 |
model_kwargs = {
|
| 417 |
'moe': {
|
|
|
|
| 420 |
'bark': {
|
| 421 |
'fn_index': 3
|
| 422 |
},
|
|
|
|
|
|
|
|
|
|
| 423 |
}
|
| 424 |
# def get_random_split(existing_split=None):
|
| 425 |
# choice = random.choice(list(audio_dataset.keys()))
|
|
|
|
| 699 |
|
| 700 |
result = router.predict(*model_args[model], **model_kwargs[model])
|
| 701 |
else:
|
| 702 |
+
result = get_tts_file(text, model)  # model is required: it selects the endpoint URL and request payload
|
| 703 |
# URL to download the file from
|
| 704 |
+
url = f"{model_links[model]}file={result}"
|
| 705 |
# Local filename to save the downloaded file
|
| 706 |
local_filename = '/tmp/' + str(mkuuid(None)) + '.wav'
|
| 707 |
|
|
|
|
| 926 |
|
| 927 |
with gr.Blocks() as about:
|
| 928 |
gr.Markdown(ABOUT)
|
| 929 |
+
# with gr.Blocks() as admin:
|
| 930 |
+
# rdb = gr.Button("Reload Audio Dataset")
|
| 931 |
+
# # rdb.click(reload_audio_dataset, outputs=rdb)
|
| 932 |
+
# with gr.Group():
|
| 933 |
+
# dbtext = gr.Textbox(label="Type \"delete db\" to confirm", placeholder="delete db")
|
| 934 |
+
# ddb = gr.Button("Delete DB")
|
| 935 |
+
# ddb.click(del_db, inputs=dbtext, outputs=ddb)
|
| 936 |
with gr.Blocks(theme=theme, css="footer {visibility: hidden}textbox{resize:none}", title="TTS Arena copy") as demo:
|
| 937 |
gr.Markdown(DESCR)
|
| 938 |
+
# gr.TabbedInterface([vote, leaderboard, about, admin], ['Vote', 'Leaderboard', 'About', 'Admin (ONLY IN BETA)'])
|
| 939 |
+
gr.TabbedInterface([vote, leaderboard, about], ['🗳️ Vote', '🏆 Leaderboard', '📄 About'])
|
| 940 |
if CITATION_TEXT:
|
| 941 |
with gr.Row():
|
| 942 |
with gr.Accordion("Citation", open=False):
|