alan committed on
Commit 0414b49 · 1 Parent(s): 947b8c3

added google api

Files changed (2)
  1. app.py +11 -4
  2. utils.py +47 -0
app.py CHANGED
@@ -15,7 +15,11 @@ import tempfile
 from pydub import AudioSegment
 import requests
 import json
+from google.cloud import texttospeech

+from utils import get_credentials, get_google_tts
+
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

 def match_target_amplitude(sound, target_dBFS):
     change_in_dBFS = target_dBFS - sound.dBFS
@@ -25,8 +29,6 @@ def match_target_amplitude(sound, target_dBFS):

 # enable_space_ci()

-
-
 # toxicity = Detoxify('original')
 # with open('harvard_sentences.txt') as f:
 with open('ja_sentences.txt') as f:
@@ -55,7 +57,8 @@ AVAILABLE_MODELS = {
     'KOTOBA-SPEECH-ALEX': 'kotoba-speech-alex',
     'KOTOBA-SPEECH-JACOB': 'kotoba-speech-jacob',
     'BLANE-TTS': 'blane-tts',
-    'AMITARO-VITS': 'amitaro-vits'
+    'AMITARO-VITS': 'amitaro-vits',
+    'GOOGLE-API': 'google-api'
 }

 SPACE_ID = os.getenv('SPACE_ID')
@@ -392,7 +395,8 @@ model_names = {
     'kotoba-speech-alex': 'KOTOBA-SPEECH-v0.1-ALEX',
     'kotoba-speech-jacob': 'KOTOBA-SPEECH-v0.1-JACOB',
     'blane-tts': 'BLANE-TTS',
-    'amitaro-vits': 'AMITARO-VITS'
+    'amitaro-vits': 'AMITARO-VITS',
+    'google-api': 'google-api'
     # 'styletts2': 'StyleTTS 2',
 }
 model_licenses = {
@@ -740,6 +744,9 @@ def synthandreturn(text):
         print(model_args[model])
         print(model_kwargs[model])
         result = router.predict(*model_args[model], **model_kwargs[model])
+    elif model == "google-api":
+        local_filename = '/tmp/' + str(mkuuid(None)) + '.wav'
+        result = get_google_tts(text, local_filename=local_filename)
     else:
         result = get_tts_file(text, model)
     # URL to download the file from
utils.py ADDED
@@ -0,0 +1,47 @@
+import os
+import json
+import tempfile
+from google.cloud import texttospeech
+
+
+def get_credentials():
+    creds_json_str = os.getenv("GCP_CREDENTIAL_JSON")  # get json credentials stored as a string
+
+    # create a temporary file
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as temp:
+        temp.write(creds_json_str)  # write in json format
+        temp_filename = temp.name
+
+    return temp_filename
+
+def get_google_tts(text, local_filename):
+    # Instantiates a client
+    client = texttospeech.TextToSpeechClient()
+
+    # Set the text input to be synthesized
+    synthesis_input = texttospeech.SynthesisInput(text=text)
+
+    # Build the voice request, select the language code ("ja-JP") and the ssml
+    # voice gender ("neutral")
+    voice = texttospeech.VoiceSelectionParams(
+        language_code="ja-JP", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
+    )
+
+    # Select the type of audio file you want returned
+    audio_config = texttospeech.AudioConfig(
+        audio_encoding=texttospeech.AudioEncoding.MP3
+    )
+
+    # Perform the text-to-speech request on the text input with the selected
+    # voice parameters and audio file type
+    response = client.synthesize_speech(
+        input=synthesis_input, voice=voice, audio_config=audio_config
+    )
+
+    # The response's audio_content is binary.
+    with open(local_filename, "wb") as out:
+        # Write the response to the output file.
+        out.write(response.audio_content)
+        print(f'Audio content written to file {local_filename}')
+
+    return local_filename
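For reference, a minimal sketch of how the two new helpers can be exercised on their own, assuming the GCP_CREDENTIAL_JSON environment variable holds a service-account key as a JSON string and google-cloud-texttospeech is installed; the sample text and output path below are illustrative, not part of the commit:

import os
from utils import get_credentials, get_google_tts

# Point the Google client at a temp file holding the service-account key,
# mirroring what app.py now does at import time.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

# Synthesize a short Japanese sentence and write the returned bytes to disk.
audio_path = get_google_tts("こんにちは。", local_filename="/tmp/google_tts_sample.mp3")
print(audio_path)

Note that utils.py requests MP3 encoding in audio_config while the new app.py branch saves the result under a .wav extension; anything downstream that keys off the file suffix may want those kept consistent.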