Marti Umbert committed on
Commit
777c863
·
1 Parent(s): f87acac

whisperlivekit/audio_processor.py: define and call translate_text, define translation_tokenizer and translator attributes in class AudioProcessor

Browse files
Files changed (1) hide show
  1. whisperlivekit/audio_processor.py +21 -2
whisperlivekit/audio_processor.py CHANGED
@@ -62,6 +62,10 @@ class AudioProcessor:
62
  self.transcription_queue = asyncio.Queue() if self.args.transcription else None
63
  self.diarization_queue = asyncio.Queue() if self.args.diarization else None
64
  self.pcm_buffer = bytearray()
 
 
 
 
65
 
66
  # Initialize transcription engine if enabled
67
  if self.args.transcription:
@@ -368,13 +372,15 @@ class AudioProcessor:
368
  "text": token.text,
369
  "beg": format_time(token.start),
370
  "end": format_time(token.end),
371
- "diff": round(token.end - last_end_diarized, 2)
 
372
  })
373
  previous_speaker = speaker
374
  elif token.text: # Only append if text isn't empty
375
  lines[-1]["text"] += sep + token.text
376
  lines[-1]["end"] = format_time(token.end)
377
  lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
 
378
 
379
  # Handle undiarized text
380
  if undiarized_text:
@@ -508,4 +514,17 @@ class AudioProcessor:
508
  else:
509
  logger.error("Maximum retries reached for FFmpeg process")
510
  await self.restart_ffmpeg()
511
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  self.transcription_queue = asyncio.Queue() if self.args.transcription else None
63
  self.diarization_queue = asyncio.Queue() if self.args.diarization else None
64
  self.pcm_buffer = bytearray()
65
+
66
+ # added for translation from transcription
67
+ self.translation_tokenizer = models.translation_tokenizer
68
+ self.translator = models.translator
69
 
70
  # Initialize transcription engine if enabled
71
  if self.args.transcription:
 
372
  "text": token.text,
373
  "beg": format_time(token.start),
374
  "end": format_time(token.end),
375
+ "diff": round(token.end - last_end_diarized, 2),
376
+ "translation": self.translate_text(text = token.text)
377
  })
378
  previous_speaker = speaker
379
  elif token.text: # Only append if text isn't empty
380
  lines[-1]["text"] += sep + token.text
381
  lines[-1]["end"] = format_time(token.end)
382
  lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
383
+ lines[-1]["translation"] = self.translate_text(text = lines[-1]["text"])
384
 
385
  # Handle undiarized text
386
  if undiarized_text:
 
514
  else:
515
  logger.error("Maximum retries reached for FFmpeg process")
516
  await self.restart_ffmpeg()
517
+ return
518
+
519
async def translate_text(self, text: str) -> str:
    """Translate recognized text to the target language.

    This is a coroutine function: calling it returns an awaitable, so the
    result must be awaited to obtain the translated string. Storing the
    un-awaited return value stores a coroutine object, not text.

    Args:
        text: The text to translate.

    Returns:
        The detokenized first hypothesis produced for ``text``. ``text`` is
        returned unchanged when it is empty or whitespace-only, when the
        tokenizer or translator is missing or ``None``, or when the
        translator returns no hypothesis.
    """
    # Nothing to translate: skip the model call entirely.
    if not text or not text.strip():
        return text

    # getattr (rather than hasattr) also catches attributes that exist but
    # were set to None because no translation model was loaded.
    tokenizer = getattr(self, "translation_tokenizer", None)
    translator = getattr(self, "translator", None)
    if tokenizer is None or translator is None:
        logger.warning("Translation model is not loaded. Skipping translation.")
        return text

    # Tokenize, translate, and detokenize.
    # NOTE(review): assumes tokenize() returns a sequence whose first item
    # is the token list (e.g. a (tokens, features) pair) -- confirm against
    # the tokenizer actually loaded.
    tokenized = tokenizer.tokenize(text)
    translated = translator.translate_batch([tokenized[0]])

    # Guard against an empty result instead of raising IndexError.
    if not translated or not translated[0].hypotheses:
        logger.warning("Translation returned no hypotheses. Returning original text.")
        return text

    return tokenizer.detokenize(translated[0].hypotheses[0])
529
+
530
+