lihaoxin2020 committed on
Commit
8e8c0ba
·
1 Parent(s): b4e812c

kotoba-speech debug

Browse files
Files changed (1) hide show
  1. app.py +73 -15
app.py CHANGED
@@ -45,7 +45,8 @@ AVAILABLE_MODELS = {
45
  # 'VoiceCraft 2.0': 'voicecraft',
46
  # 'Parler TTS': 'parler'
47
  'MOE': 'moe',
48
- 'BARK': 'bark'
 
49
  }
50
 
51
  SPACE_ID = os.getenv('SPACE_ID')
@@ -105,10 +106,62 @@ def create_db_if_missing():
105
  timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
106
  );
107
  ''')
 
108
  def get_db():
109
  return sqlite3.connect(DB_PATH)
110
 
111
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  ####################################
114
  # Space initialization
@@ -128,7 +181,6 @@ if not os.path.isfile(DB_PATH):
128
  create_db_if_missing()
129
 
130
  # Sync local DB with remote repo every 5 minutes (only if a change is detected)
131
- print("[debug]", DB_DATASET_ID)
132
  scheduler = CommitScheduler(
133
  repo_id=DB_DATASET_ID,
134
  repo_type="dataset",
@@ -278,7 +330,8 @@ model_names = {
278
  # 'metavoice': 'MetaVoice-1B',
279
  'bark': 'BARK',
280
  'moe': 'MOE',
281
- 'styletts2': 'StyleTTS 2',
 
282
  }
283
  model_licenses = {
284
  'styletts2': 'MIT',
@@ -328,6 +381,7 @@ model_links = {
328
  # 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
329
  'bark': 'https://suno-bark.hf.space/',
330
  'moe': 'skytnt/moe-tts',
 
331
  }
332
  model_kwargs = {
333
  'moe': {
@@ -335,7 +389,10 @@ model_kwargs = {
335
  },
336
  'bark': {
337
  'fn_index': 3
338
- },
 
 
 
339
  }
340
  # def get_random_split(existing_split=None):
341
  # choice = random.choice(list(audio_dataset.keys()))
@@ -585,14 +642,12 @@ def synthandreturn(text):
585
  raise gr.Error(f'You did not enter any text')
586
  # Check language
587
  try:
588
- if not detect(text) == "en":
589
- print(text)
590
- gr.Warning('Warning: The input text may not be in English')
591
  except:
592
  pass
593
  # Get two random models
594
  mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
595
- # mdl1, mdl2 = "moe1", "moe2"
596
  log_text(text)
597
  print("[debug] Using", mdl1, mdl2)
598
  def predict_and_update_result(text, model, result_storage):
@@ -609,12 +664,15 @@ def synthandreturn(text):
609
  ),
610
  }
611
  # result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
612
- router = Client(model_links[model])
613
- # debug
614
- print(model_args[model])
615
- print(model_kwargs[model])
616
-
617
- result = router.predict(*model_args[model], **model_kwargs[model])
 
 
 
618
  else:
619
  # result = router.predict(text, model.lower(), api_name="/synthesize")
620
  # result = router.predict(
 
45
  # 'VoiceCraft 2.0': 'voicecraft',
46
  # 'Parler TTS': 'parler'
47
  'MOE': 'moe',
48
+ # 'BARK': 'bark',
49
+ 'KOTOBA-SPEECH': 'kotoba-speech'
50
  }
51
 
52
  SPACE_ID = os.getenv('SPACE_ID')
 
106
  timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
107
  );
108
  ''')
109
+
110
  def get_db():
111
  return sqlite3.connect(DB_PATH)
112
 
113
def kotoba_speech_tts(text, *, timeout=None):
    """Synthesize ``text`` with the hosted Kotoba-Speech Space and return the audio path.

    The Space is driven through Gradio's two-step HTTP API: a POST to
    ``/call/tts`` enqueues the job and returns an ``event_id``; a GET on
    ``/call/tts/<event_id>`` then streams server-sent events until the job
    finishes.

    Args:
        text: The text to synthesize.
        timeout: Optional ``requests`` timeout (seconds, or a
            ``(connect, read)`` tuple) applied to both HTTP calls. ``None``
            (the default) waits indefinitely, as the function always did.

    Returns:
        The ``path`` entry of the first output of the ``complete`` event.

    Raises:
        requests.HTTPError: If either HTTP call returns an error status.
        RuntimeError: If the Space reports an ``error`` event, or the stream
            ends before a ``complete`` event is received.
    """
    call_url = "https://kotoba-tech-kotoba-speech.hf.space/call/tts"
    headers = {"Content-Type": "application/json"}
    # Positional inputs of the Space's /tts endpoint.
    # NOTE(review): the meaning of the two 5s and of the two sample-voice
    # paths is not visible from here -- confirm against the Space's API page.
    payload = {
        "data": [
            text,
            5,
            5,
            "Preset voices",
            "Ava",
            {"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"},
            {"path": "https://keikaku-hoso.com/sample_voice/voice01_A.mp3"},
        ]
    }

    # Step 1: enqueue the job; the JSON reply carries the id of the queued event.
    response = requests.post(
        call_url, headers=headers, data=json.dumps(payload), timeout=timeout
    )
    response.raise_for_status()
    event_id = response.json()["event_id"]

    # Step 2: follow the event stream for that job. The context manager makes
    # sure the streamed connection is released even when we return early.
    stream_url = f"{call_url}/{event_id}"
    with requests.get(stream_url, stream=True, timeout=timeout) as stream_response:
        stream_response.raise_for_status()

        current_event = None
        for raw_line in stream_response.iter_lines():
            if not raw_line:
                # Blank lines only separate events in the stream.
                continue
            field, _, value = raw_line.decode("utf-8").partition(":")
            value = value.strip()
            if field == "event":
                current_event = value
            elif field == "data":
                if current_event == "error":
                    raise RuntimeError(f"kotoba-speech synthesis failed: {value}")
                if current_event == "complete":
                    # Only the final event carries the finished outputs; other
                    # events (e.g. heartbeats) are not results and are skipped.
                    audio_path = json.loads(value)[0]["path"]
                    print("[debug] kotoba-speech result:", audio_path)
                    return audio_path

    raise RuntimeError("kotoba-speech stream ended without a 'complete' event")
165
 
166
  ####################################
167
  # Space initialization
 
181
  create_db_if_missing()
182
 
183
  # Sync local DB with remote repo every 5 minutes (only if a change is detected)
 
184
  scheduler = CommitScheduler(
185
  repo_id=DB_DATASET_ID,
186
  repo_type="dataset",
 
330
  # 'metavoice': 'MetaVoice-1B',
331
  'bark': 'BARK',
332
  'moe': 'MOE',
333
+ 'kotoba-speech': 'KOTOBA-SPEECH'
334
+ # 'styletts2': 'StyleTTS 2',
335
  }
336
  model_licenses = {
337
  'styletts2': 'MIT',
 
381
  # 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
382
  'bark': 'https://suno-bark.hf.space/',
383
  'moe': 'skytnt/moe-tts',
384
+ 'kotoba-speech': 'kotoba-tech/kotoba-speech'
385
  }
386
  model_kwargs = {
387
  'moe': {
 
389
  },
390
  'bark': {
391
  'fn_index': 3
392
+ },
393
+ 'kotoba-speech': {
394
+ 'api_name': '/tts'
395
+ }
396
  }
397
  # def get_random_split(existing_split=None):
398
  # choice = random.choice(list(audio_dataset.keys()))
 
642
  raise gr.Error(f'You did not enter any text')
643
  # Check language
644
  try:
645
+ if not detect(text) == "ja":
646
+ gr.Warning('Warning: The input text may not be in Japanese')
 
647
  except:
648
  pass
649
  # Get two random models
650
  mdl1, mdl2 = random.sample(list(AVAILABLE_MODELS.keys()), 2)
 
651
  log_text(text)
652
  print("[debug] Using", mdl1, mdl2)
653
  def predict_and_update_result(text, model, result_storage):
 
664
  ),
665
  }
666
  # result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
667
+ if model != "kotoba-speech":
668
+ router = Client(model_links[model])
669
+ # debug
670
+ print(model_args[model])
671
+ print(model_kwargs[model])
672
+
673
+ result = router.predict(*model_args[model], **model_kwargs[model])
674
+ else:
675
+ result = kotoba_speech_tts(text)
676
  else:
677
  # result = router.predict(text, model.lower(), api_name="/synthesize")
678
  # result = router.predict(