Marti Umbert
committed on
Commit
·
777c863
1
Parent(s):
f87acac
whisperlivekit/audio_processor.py: define and call translate_text, define translation_tokenizer and translator attributes in class AudioProcessor
Browse files
whisperlivekit/audio_processor.py
CHANGED
|
@@ -62,6 +62,10 @@ class AudioProcessor:
|
|
| 62 |
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
| 63 |
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
| 64 |
self.pcm_buffer = bytearray()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Initialize transcription engine if enabled
|
| 67 |
if self.args.transcription:
|
|
@@ -368,13 +372,15 @@ class AudioProcessor:
|
|
| 368 |
"text": token.text,
|
| 369 |
"beg": format_time(token.start),
|
| 370 |
"end": format_time(token.end),
|
| 371 |
-
"diff": round(token.end - last_end_diarized, 2)
|
|
|
|
| 372 |
})
|
| 373 |
previous_speaker = speaker
|
| 374 |
elif token.text: # Only append if text isn't empty
|
| 375 |
lines[-1]["text"] += sep + token.text
|
| 376 |
lines[-1]["end"] = format_time(token.end)
|
| 377 |
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
|
|
|
| 378 |
|
| 379 |
# Handle undiarized text
|
| 380 |
if undiarized_text:
|
|
@@ -508,4 +514,17 @@ class AudioProcessor:
|
|
| 508 |
else:
|
| 509 |
logger.error("Maximum retries reached for FFmpeg process")
|
| 510 |
await self.restart_ffmpeg()
|
| 511 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
|
| 63 |
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
|
| 64 |
self.pcm_buffer = bytearray()
|
| 65 |
+
|
| 66 |
+
# added for translation from transcription
|
| 67 |
+
self.translation_tokenizer = models.translation_tokenizer
|
| 68 |
+
self.translator = models.translator
|
| 69 |
|
| 70 |
# Initialize transcription engine if enabled
|
| 71 |
if self.args.transcription:
|
|
|
|
| 372 |
"text": token.text,
|
| 373 |
"beg": format_time(token.start),
|
| 374 |
"end": format_time(token.end),
|
| 375 |
+
"diff": round(token.end - last_end_diarized, 2),
|
| 376 |
+
"translation": self.translate_text(text = token.text)
|
| 377 |
})
|
| 378 |
previous_speaker = speaker
|
| 379 |
elif token.text: # Only append if text isn't empty
|
| 380 |
lines[-1]["text"] += sep + token.text
|
| 381 |
lines[-1]["end"] = format_time(token.end)
|
| 382 |
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
| 383 |
+
lines[-1]["translation"] = self.translate_text(text = lines[-1]["text"])
|
| 384 |
|
| 385 |
# Handle undiarized text
|
| 386 |
if undiarized_text:
|
|
|
|
| 514 |
else:
|
| 515 |
logger.error("Maximum retries reached for FFmpeg process")
|
| 516 |
await self.restart_ffmpeg()
|
| 517 |
+
return
|
| 518 |
+
|
| 519 |
+
def translate_text(self, text: str) -> str:
    """Translate recognized transcription text to the target language.

    NOTE(review): this was declared ``async`` although it contains no
    ``await``, and the visible call sites invoke it synchronously
    (``self.translate_text(text=token.text)``) — which would have stored
    a coroutine object, not a string, in the result dict. It is now a
    plain method so those calls behave as intended.

    Args:
        text: Source-language text produced by the transcription step.

    Returns:
        The translated text, or ``text`` unchanged when no translation
        model is available.
    """
    tokenizer = getattr(self, "translation_tokenizer", None)
    translator = getattr(self, "translator", None)
    # __init__ always assigns these attributes (possibly as None), so a
    # bare hasattr() check can never fire — treat None as "not loaded".
    if tokenizer is None or translator is None:
        logger.warning("Translation model is not loaded. Skipping translation.")
        return text

    # Tokenize, translate, detokenize.
    # NOTE(review): assumes tokenize() returns a (tokens, features) tuple
    # (pyonmttok-style), hence tokenized[0]; if the tokenizer returned a
    # flat token list this would translate only the first token — confirm.
    tokenized = tokenizer.tokenize(text)
    translated = translator.translate_batch([tokenized[0]])
    return tokenizer.detokenize(translated[0].hypotheses[0])
|
| 529 |
+
|
| 530 |
+
|