Marti Umbert
committed on
Commit
·
777c863
1
Parent(s):
f87acac
whisperlivekit/audio_processor.py: define and call translate_text, define translation_tokenizer and translator attributes in class AudioProcessor
Browse files
whisperlivekit/audio_processor.py
CHANGED
|
@@ -62,6 +62,10 @@ class AudioProcessor:
|
|
| 62 |
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
| 63 |
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
| 64 |
self.pcm_buffer = bytearray()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Initialize transcription engine if enabled
|
| 67 |
if self.args.transcription:
|
|
@@ -368,13 +372,15 @@ class AudioProcessor:
|
|
| 368 |
"text": token.text,
|
| 369 |
"beg": format_time(token.start),
|
| 370 |
"end": format_time(token.end),
|
| 371 |
-
"diff": round(token.end - last_end_diarized, 2)
|
|
|
|
| 372 |
})
|
| 373 |
previous_speaker = speaker
|
| 374 |
elif token.text: # Only append if text isn't empty
|
| 375 |
lines[-1]["text"] += sep + token.text
|
| 376 |
lines[-1]["end"] = format_time(token.end)
|
| 377 |
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
|
|
|
| 378 |
|
| 379 |
# Handle undiarized text
|
| 380 |
if undiarized_text:
|
|
@@ -508,4 +514,17 @@ class AudioProcessor:
|
|
| 508 |
else:
|
| 509 |
logger.error("Maximum retries reached for FFmpeg process")
|
| 510 |
await self.restart_ffmpeg()
|
| 511 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
| 63 |
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
| 64 |
self.pcm_buffer = bytearray()
|
| 65 |
+
|
| 66 |
+
# added for translation from transcription
|
| 67 |
+
self.translation_tokenizer = models.translation_tokenizer
|
| 68 |
+
self.translator = models.translator
|
| 69 |
|
| 70 |
# Initialize transcription engine if enabled
|
| 71 |
if self.args.transcription:
|
|
|
|
| 372 |
"text": token.text,
|
| 373 |
"beg": format_time(token.start),
|
| 374 |
"end": format_time(token.end),
|
| 375 |
+
"diff": round(token.end - last_end_diarized, 2),
|
| 376 |
+
"translation": self.translate_text(text = token.text)
|
| 377 |
})
|
| 378 |
previous_speaker = speaker
|
| 379 |
elif token.text: # Only append if text isn't empty
|
| 380 |
lines[-1]["text"] += sep + token.text
|
| 381 |
lines[-1]["end"] = format_time(token.end)
|
| 382 |
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
| 383 |
+
lines[-1]["translation"] = self.translate_text(text = lines[-1]["text"])
|
| 384 |
|
| 385 |
# Handle undiarized text
|
| 386 |
if undiarized_text:
|
|
|
|
| 514 |
else:
|
| 515 |
logger.error("Maximum retries reached for FFmpeg process")
|
| 516 |
await self.restart_ffmpeg()
|
| 517 |
+
return
|
| 518 |
+
|
| 519 |
+
def translate_text(self, text: str) -> str:
    """Translate recognized transcription text to the target language.

    NOTE(review): this was declared ``async`` although it contains no
    ``await``, and the visible call sites invoke it synchronously
    (``self.translate_text(text=token.text)``) — which would have stored
    a coroutine object, not a string, in the result dict. It is now a
    plain method so those calls behave as intended.

    Args:
        text: Source-language text produced by the transcription step.

    Returns:
        The translated text, or ``text`` unchanged when no translation
        model is available.
    """
    tokenizer = getattr(self, "translation_tokenizer", None)
    translator = getattr(self, "translator", None)
    # __init__ always assigns these attributes (possibly as None), so a
    # bare hasattr() check can never fire — treat None as "not loaded".
    if tokenizer is None or translator is None:
        logger.warning("Translation model is not loaded. Skipping translation.")
        return text

    # Tokenize, translate, detokenize.
    # NOTE(review): assumes tokenize() returns a (tokens, features) tuple
    # (pyonmttok-style), hence tokenized[0]; if the tokenizer returned a
    # flat token list this would translate only the first token — confirm.
    tokenized = tokenizer.tokenize(text)
    translated = translator.translate_batch([tokenized[0]])
    return tokenizer.detokenize(translated[0].hypotheses[0])
|
| 529 |
+
|
| 530 |
+
|