Spaces:
Paused
Paused
Commit
·
8e8c0ba
1
Parent(s):
b4e812c
kotoba-speech debug
Browse files
app.py
CHANGED
|
@@ -45,7 +45,8 @@ AVAILABLE_MODELS = {
|
|
| 45 |
# 'VoiceCraft 2.0': 'voicecraft',
|
| 46 |
# 'Parler TTS': 'parler'
|
| 47 |
'MOE': 'moe',
|
| 48 |
-
'BARK': 'bark'
|
|
|
|
| 49 |
}
|
| 50 |
|
| 51 |
SPACE_ID = os.getenv('SPACE_ID')
|
|
@@ -105,10 +106,62 @@ def create_db_if_missing():
|
|
| 105 |
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 106 |
);
|
| 107 |
''')
|
|
|
|
| 108 |
def get_db():
    """Open and return a new SQLite connection to the file at ``DB_PATH``."""
    connection = sqlite3.connect(DB_PATH)
    return connection
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
####################################
|
| 114 |
# Space initialization
|
|
@@ -128,7 +181,6 @@ if not os.path.isfile(DB_PATH):
|
|
| 128 |
create_db_if_missing()
|
| 129 |
|
| 130 |
# Sync local DB with remote repo every 5 minutes (only if a change is detected)
|
| 131 |
-
print("[debug]", DB_DATASET_ID)
|
| 132 |
scheduler = CommitScheduler(
|
| 133 |
repo_id=DB_DATASET_ID,
|
| 134 |
repo_type="dataset",
|
|
@@ -278,7 +330,8 @@ model_names = {
|
|
| 278 |
# 'metavoice': 'MetaVoice-1B',
|
| 279 |
'bark': 'BARK',
|
| 280 |
'moe': 'MOE',
|
| 281 |
-
'
|
|
|
|
| 282 |
}
|
| 283 |
model_licenses = {
|
| 284 |
'styletts2': 'MIT',
|
|
@@ -328,6 +381,7 @@ model_links = {
|
|
| 328 |
# 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
|
| 329 |
'bark': 'https://suno-bark.hf.space/',
|
| 330 |
'moe': 'skytnt/moe-tts',
|
|
|
|
| 331 |
}
|
| 332 |
model_kwargs = {
|
| 333 |
'moe': {
|
|
@@ -335,7 +389,10 @@ model_kwargs = {
|
|
| 335 |
},
|
| 336 |
'bark': {
|
| 337 |
'fn_index': 3
|
| 338 |
-
},
|
|
|
|
|
|
|
|
|
|
| 339 |
}
|
| 340 |
# def get_random_split(existing_split=None):
|
| 341 |
# choice = random.choice(list(audio_dataset.keys()))
|
|
@@ -585,14 +642,12 @@ def synthandreturn(text):
|
|
| 585 |
raise gr.Error(f'You did not enter any text')
|
| 586 |
# Check language
|
| 587 |
try:
|
| 588 |
-
if not detect(text) == "
|
| 589 |
-
|
| 590 |
-
gr.Warning('Warning: The input text may not be in English')
|
| 591 |
except:
|
| 592 |
pass
|
| 593 |
# Get two random models
|
| 594 |
mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
|
| 595 |
-
# mdl1, mdl2 = "moe1", "moe2"
|
| 596 |
log_text(text)
|
| 597 |
print("[debug] Using", mdl1, mdl2)
|
| 598 |
def predict_and_update_result(text, model, result_storage):
|
|
@@ -609,12 +664,15 @@ def synthandreturn(text):
|
|
| 609 |
),
|
| 610 |
}
|
| 611 |
# result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
|
|
|
|
|
|
|
|
|
| 618 |
else:
|
| 619 |
# result = router.predict(text, model.lower(), api_name="/synthesize")
|
| 620 |
# result = router.predict(
|
|
|
|
| 45 |
# 'VoiceCraft 2.0': 'voicecraft',
|
| 46 |
# 'Parler TTS': 'parler'
|
| 47 |
'MOE': 'moe',
|
| 48 |
+
# 'BARK': 'bark',
|
| 49 |
+
'KOTOBA-SPEECH': 'kotoba-speech'
|
| 50 |
}
|
| 51 |
|
| 52 |
SPACE_ID = os.getenv('SPACE_ID')
|
|
|
|
| 106 |
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
| 107 |
);
|
| 108 |
''')
|
| 109 |
+
|
| 110 |
def get_db():
    """Open and return a new SQLite connection to the file at ``DB_PATH``."""
    connection = sqlite3.connect(DB_PATH)
    return connection
|
| 112 |
|
| 113 |
+
def kotoba_speech_tts(text):
    """Synthesize ``text`` with the Kotoba-Speech Space and return the audio path.

    The call is a two-step exchange with the Space's ``/call/tts`` endpoint:
    a POST submits the job and yields an ``event_id``, then a GET on
    ``/call/tts/<event_id>`` streams the job's progress and result.

    Args:
        text: The text to synthesize.

    Returns:
        The ``path`` field of the first output in the completed result.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        requests.Timeout: If the Space does not answer within the timeout.
        KeyError: If the POST response carries no ``event_id``.
        RuntimeError: If the stream reports an error event, or ends without
            delivering a completed result.
    """
    url = "https://kotoba-tech-kotoba-speech.hf.space/call/tts"
    headers = {
        "Content-Type": "application/json"
    }
    # NOTE(review): the meaning of the two ``5`` values and of the trailing
    # sample-voice entries is defined by the remote Space's ``tts`` signature,
    # which is not visible here -- confirm against the Space before changing.
    data = {
        "data": [
            text,
            5,
            5,
            "Preset voices",
            "Ava",
            {"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"},
            {"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"},
        ]
    }
    # (connect, read) timeouts so a stalled Space cannot hang the caller
    # forever. The read timeout is generous because synthesis can be slow.
    timeout = (10, 300)

    # Step 1: submit the job.
    response = requests.post(
        url, headers=headers, data=json.dumps(data), timeout=timeout
    )
    response.raise_for_status()

    # Decode the body once and reuse it for both the debug print and the id.
    response_json = response.json()
    print("Response JSON:", response_json)
    event_id = response_json['event_id']

    # Step 2: follow the result stream for that job.
    stream_url = f"{url}/{event_id}"
    # Assumes the stream is server-sent events made of ``event: <name>`` /
    # ``data: <json>`` line pairs, as Gradio's ``/call`` API emits -- the
    # original ``[6:]`` slice matches a ``data: `` prefix. TODO confirm the
    # exact event names against the deployed Gradio version.
    current_event = None
    # The ``with`` block releases the connection even when we return early.
    with requests.get(stream_url, stream=True, timeout=timeout) as stream_response:
        stream_response.raise_for_status()
        for line in stream_response.iter_lines():
            if not line:
                continue
            decoded_line = line.decode('utf-8')
            print(decoded_line)

            if decoded_line.startswith("event:"):
                current_event = decoded_line[len("event:"):].strip()
                continue
            if not decoded_line.startswith("data:"):
                # Comments or other SSE fields carry no result.
                continue

            payload = decoded_line[len("data:"):].strip()
            if current_event == "error":
                raise RuntimeError(f"Kotoba-Speech returned an error: {payload}")
            if current_event != "complete":
                # Heartbeats and intermediate updates are not the final
                # result (their payload may be ``null``).
                continue

            parsed_dir = json.loads(payload)[0]['path']
            print(parsed_dir)
            return parsed_dir

    raise RuntimeError("Kotoba-Speech stream ended without a completed result")
|
| 165 |
|
| 166 |
####################################
|
| 167 |
# Space initialization
|
|
|
|
| 181 |
create_db_if_missing()
|
| 182 |
|
| 183 |
# Sync local DB with remote repo every 5 minutes (only if a change is detected)
|
|
|
|
| 184 |
scheduler = CommitScheduler(
|
| 185 |
repo_id=DB_DATASET_ID,
|
| 186 |
repo_type="dataset",
|
|
|
|
| 330 |
# 'metavoice': 'MetaVoice-1B',
|
| 331 |
'bark': 'BARK',
|
| 332 |
'moe': 'MOE',
|
| 333 |
+
'kotoba-speech': 'KOTOBA-SPEECH'
|
| 334 |
+
# 'styletts2': 'StyleTTS 2',
|
| 335 |
}
|
| 336 |
model_licenses = {
|
| 337 |
'styletts2': 'MIT',
|
|
|
|
| 381 |
# 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
|
| 382 |
'bark': 'https://suno-bark.hf.space/',
|
| 383 |
'moe': 'skytnt/moe-tts',
|
| 384 |
+
'kotoba-speech': 'kotoba-tech/kotoba-speech'
|
| 385 |
}
|
| 386 |
model_kwargs = {
|
| 387 |
'moe': {
|
|
|
|
| 389 |
},
|
| 390 |
'bark': {
|
| 391 |
'fn_index': 3
|
| 392 |
+
},
|
| 393 |
+
'kotoba-speech': {
|
| 394 |
+
'api_name': '/tts'
|
| 395 |
+
}
|
| 396 |
}
|
| 397 |
# def get_random_split(existing_split=None):
|
| 398 |
# choice = random.choice(list(audio_dataset.keys()))
|
|
|
|
| 642 |
raise gr.Error(f'You did not enter any text')
|
| 643 |
# Check language
|
| 644 |
try:
|
| 645 |
+
if not detect(text) == "ja":
|
| 646 |
+
gr.Warning('Warning: The input text may not be in Japanese')
|
|
|
|
| 647 |
except:
|
| 648 |
pass
|
| 649 |
# Get two random models
|
| 650 |
mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
|
|
|
|
| 651 |
log_text(text)
|
| 652 |
print("[debug] Using", mdl1, mdl2)
|
| 653 |
def predict_and_update_result(text, model, result_storage):
|
|
|
|
| 664 |
),
|
| 665 |
}
|
| 666 |
# result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
|
| 667 |
+
if model != "kotoba-speech":
|
| 668 |
+
router = Client(model_links[model])
|
| 669 |
+
# debug
|
| 670 |
+
print(model_args[model])
|
| 671 |
+
print(model_kwargs[model])
|
| 672 |
+
|
| 673 |
+
result = router.predict(*model_args[model], **model_kwargs[model])
|
| 674 |
+
else:
|
| 675 |
+
result = kotoba_speech_tts(text)
|
| 676 |
else:
|
| 677 |
# result = router.predict(text, model.lower(), api_name="/synthesize")
|
| 678 |
# result = router.predict(
|