TTS-Arena-JA

Paused

App Files Files Community

lihaoxin2020 committed on Jul 18, 2024

Commit

8e8c0ba

1 Parent(s): b4e812c

kotoba-speech debug

Browse files

Files changed (1) hide show

app.py +73 -15

app.py CHANGED Viewed

@@ -45,7 +45,8 @@ AVAILABLE_MODELS = {
     # 'VoiceCraft 2.0': 'voicecraft',
     # 'Parler TTS': 'parler'
     'MOE': 'moe',
-    'BARK': 'bark'
 }
 SPACE_ID = os.getenv('SPACE_ID')
@@ -105,10 +106,62 @@ def create_db_if_missing():
             timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
         );
     ''')
 def get_db():
     """Open and return a new SQLite connection to the file at DB_PATH."""
     connection = sqlite3.connect(DB_PATH)
     return connection
 ####################################
 # Space initialization
@@ -128,7 +181,6 @@ if not os.path.isfile(DB_PATH):
 create_db_if_missing()
 # Sync local DB with remote repo every 5 minutes (only if a change is detected)
-print("[debug]", DB_DATASET_ID)
 scheduler = CommitScheduler(
     repo_id=DB_DATASET_ID,
     repo_type="dataset",
@@ -278,7 +330,8 @@ model_names = {
     # 'metavoice': 'MetaVoice-1B',
     'bark': 'BARK',
     'moe': 'MOE',
-    'styletts2': 'StyleTTS 2',
 }
 model_licenses = {
     'styletts2': 'MIT',
@@ -328,6 +381,7 @@ model_links = {
     # 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
     'bark': 'https://suno-bark.hf.space/',
     'moe': 'skytnt/moe-tts',
 }
 model_kwargs = {
     'moe': {
@@ -335,7 +389,10 @@ model_kwargs = {
     },
     'bark': {
         'fn_index': 3
-    },
 }
 # def get_random_split(existing_split=None):
 #     choice = random.choice(list(audio_dataset.keys()))
@@ -585,14 +642,12 @@ def synthandreturn(text):
         raise gr.Error(f'You did not enter any text')
     # Check language
     try:
-        if not detect(text) == "en":
-            print(text)
-            gr.Warning('Warning: The input text may not be in English')
     except:
         pass
     # Get two random models
     mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
-    # mdl1, mdl2 = "moe1", "moe2"
     log_text(text)
     print("[debug] Using", mdl1, mdl2)
     def predict_and_update_result(text, model, result_storage):
@@ -609,12 +664,15 @@ def synthandreturn(text):
                     ),
                 }
                 # result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
-                router = Client(model_links[model])
-                # debug
-                print(model_args[model])
-                print(model_kwargs[model])
-                result = router.predict(*model_args[model], **model_kwargs[model])
             else:
                 # result = router.predict(text, model.lower(), api_name="/synthesize")
                 # result = router.predict(

     # 'VoiceCraft 2.0': 'voicecraft',
     # 'Parler TTS': 'parler'
     'MOE': 'moe',
+    # 'BARK': 'bark',
+    'KOTOBA-SPEECH': 'kotoba-speech'
 }
 SPACE_ID = os.getenv('SPACE_ID')
             timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
         );
     ''')
 def get_db():
     """Open and return a new SQLite connection to the file at DB_PATH."""
     connection = sqlite3.connect(DB_PATH)
     return connection
+def kotoba_speech_tts(text):
+    url = "https://kotoba-tech-kotoba-speech.hf.space/call/tts"
+    headers = {
+        "Content-Type": "application/json"
+    }
+    data = {
+        "data": [
+            text,
+            5,
+            5,
+            "Preset voices",
+            "Ava",
+            {"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"},
+            {"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"}
+        ]
+    }
+    # Send POST request
+    response = requests.post(url, headers=headers, data=json.dumps(data))
+    response.raise_for_status()  # Raise an error for bad status codes
+    # Print the response to inspect its structure
+    print("Response JSON:", response.json())
+    # Extract EVENT_ID from the response
+    response_json = response.json()
+    # if 'data' in response_json and isinstance(response_json['data'], list) and len(response_json['data']) > 0:
+    #     event_id = response_json['data'][0]
+    # else:
+    #     raise KeyError("The key 'data' is not present or does not contain the expected format in the response")
+    event_id = response_json['event_id']
+    # Send GET request to the next URL
+    stream_url = f"https://kotoba-tech-kotoba-speech.hf.space/call/tts/{event_id}"
+    stream_response = requests.get(stream_url, stream=True)
+    stream_response.raise_for_status()  # Raise an error for bad status codes
+    # Process the streamed response
+    for line in stream_response.iter_lines():
+        if line:
+            decoded_line = line.decode('utf-8')
+            print(decoded_line)
+            # try:
+            #     line_json = json.loads('{' + decoded_line + '}')
+            #     if 'data' in line_json:
+            #         print("Data from stream:", line_json['data'])
+            # except json.JSONDecodeError as e:
+            #     print(f"Could not decode line as JSON: {decoded_line}")
+            #     print(f"Error: {e}")
+    parsed_dir = json.loads(decoded_line[6:])[0]['path']
+    print(parsed_dir)
+    return parsed_dir
 ####################################
 # Space initialization
 create_db_if_missing()
 # Sync local DB with remote repo every 5 minutes (only if a change is detected)
 scheduler = CommitScheduler(
     repo_id=DB_DATASET_ID,
     repo_type="dataset",
     # 'metavoice': 'MetaVoice-1B',
     'bark': 'BARK',
     'moe': 'MOE',
+    'kotoba-speech': 'KOTOBA-SPEECH'
+    # 'styletts2': 'StyleTTS 2',
 }
 model_licenses = {
     'styletts2': 'MIT',
     # 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
     'bark': 'https://suno-bark.hf.space/',
     'moe': 'skytnt/moe-tts',
+    'kotoba-speech': 'kotoba-tech/kotoba-speech'
 }
 model_kwargs = {
     'moe': {
     },
     'bark': {
         'fn_index': 3
+    },
+    'kotoba-speech': {
+        'api_name': '/tts'
+    }
 }
 # def get_random_split(existing_split=None):
 #     choice = random.choice(list(audio_dataset.keys()))
         raise gr.Error(f'You did not enter any text')
     # Check language
     try:
+        if not detect(text) == "ja":
+            gr.Warning('Warning: The input text may not be in Japanese')
     except:
         pass
     # Get two random models
     mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
     log_text(text)
     print("[debug] Using", mdl1, mdl2)
     def predict_and_update_result(text, model, result_storage):
                     ),
                 }
                 # result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
+                if model != "kotoba-speech":
+                    router = Client(model_links[model])
+                    # debug
+                    print(model_args[model])
+                    print(model_kwargs[model])
+                    result = router.predict(*model_args[model], **model_kwargs[model])
+                else:
+                    result = kotoba_speech_tts(text)
             else:
                 # result = router.predict(text, model.lower(), api_name="/synthesize")
                 # result = router.predict(