SilasKieser committed on
Commit
937fc50
·
1 Parent(s): 40c4763

use moses sentence segmenter instead of tokenizer

Browse files
src/whisper_streaming/online_asr.py CHANGED
@@ -87,11 +87,20 @@ class OnlineASRProcessor:
87
  buffer_trimming=("segment", 15),
88
  logfile=sys.stderr,
89
  ):
90
- """asr: WhisperASR object
91
- tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
92
- ("segment", 15)
93
- buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
94
- logfile: where to store the log.
 
 
 
 
 
 
 
 
 
95
  """
96
  self.asr = asr
97
  self.tokenize = tokenize_method
@@ -194,24 +203,25 @@ class OnlineASRProcessor:
194
  def chunk_completed_sentence(self):
195
  if self.commited == []:
196
  return
197
-
198
- raw_text = self.asr.sep.join([s[2] for s in self.commited])
199
- logger.debug(f"[Sentence-segmentation] Raw Text: {raw_text}")
200
 
201
  sents = self.words_to_sentences(self.commited)
202
 
203
 
204
 
205
- for s in sents:
206
- logger.debug(f"[Sentence-segmentation] completed sentence: {s}")
207
  if len(sents) < 2:
 
208
  return
209
- while len(sents) > 2:
210
- sents.pop(0)
 
 
 
 
 
211
  # we will continue with audio processing at this timestamp
212
  chunk_at = sents[-2][1]
213
 
214
- logger.debug(f"[Sentence-segmentation]: sentence chunked at {chunk_at:2.2f}")
215
  self.chunk_at(chunk_at)
216
 
217
  def chunk_completed_segment(self, res):
@@ -249,8 +259,9 @@ class OnlineASRProcessor:
249
  """
250
 
251
  cwords = [w for w in words]
252
- t = " ".join(o[2] for o in cwords)
253
- s = self.tokenize(t)
 
254
  out = []
255
  while s:
256
  beg = None
 
87
  buffer_trimming=("segment", 15),
88
  logfile=sys.stderr,
89
  ):
90
+ """
91
+ Initialize OnlineASRProcessor.
92
+
93
+ Args:
94
+ asr: WhisperASR object
95
+ tokenize_method: Sentence tokenizer function for the target language.
96
+ Must be a function that takes a list of text as input like MosesSentenceSplitter.
97
+ Can be None if using "segment" buffer trimming option.
98
+ buffer_trimming: Tuple of (option, seconds) where:
99
+ - option: Either "sentence" or "segment"
100
+ - seconds: Number of seconds threshold for buffer trimming
101
+ Default is ("segment", 15)
102
+ logfile: File to store logs
103
+
104
  """
105
  self.asr = asr
106
  self.tokenize = tokenize_method
 
203
  def chunk_completed_sentence(self):
204
  if self.commited == []:
205
  return
 
 
 
206
 
207
  sents = self.words_to_sentences(self.commited)
208
 
209
 
210
 
 
 
211
  if len(sents) < 2:
212
+ logger.debug(f"[Sentence-segmentation] no sentence segmented.")
213
  return
214
+
215
+
216
+
217
+ identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sents])
218
+ logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}")
219
+
220
+
221
  # we will continue with audio processing at this timestamp
222
  chunk_at = sents[-2][1]
223
 
224
+ logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}")
225
  self.chunk_at(chunk_at)
226
 
227
  def chunk_completed_segment(self, res):
 
259
  """
260
 
261
  cwords = [w for w in words]
262
+ t = self.asr.sep.join(o[2] for o in cwords)
263
+ logger.debug(f"[Sentence-segmentation] Raw Text: {t}")
264
+ s = self.tokenize([t])
265
  out = []
266
  while s:
267
  beg = None
whisper_online.py CHANGED
@@ -49,16 +49,16 @@ def create_tokenizer(lan):
49
  lan
50
  in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split()
51
  ):
52
- from mosestokenizer import MosesTokenizer
53
 
54
- return MosesTokenizer(lan)
55
 
56
  # the following languages are in Whisper, but not in wtpsplit:
57
  if (
58
  lan
59
  in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split()
60
  ):
61
- logger.warning(
62
  f"{lan} code is not supported by wtpsplit. Going to use None lang_code option."
63
  )
64
  lan = None
@@ -204,6 +204,7 @@ def backend_factory(args):
204
 
205
  # Create the tokenizer
206
  if args.buffer_trimming == "sentence":
 
207
  tokenizer = create_tokenizer(tgt_language)
208
  else:
209
  tokenizer = None
 
49
  lan
50
  in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split()
51
  ):
52
+ from mosestokenizer import MosesSentenceSplitter
53
 
54
+ return MosesSentenceSplitter(lan)
55
 
56
  # the following languages are in Whisper, but not in wtpsplit:
57
  if (
58
  lan
59
  in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split()
60
  ):
61
+ logger.debug(
62
  f"{lan} code is not supported by wtpsplit. Going to use None lang_code option."
63
  )
64
  lan = None
 
204
 
205
  # Create the tokenizer
206
  if args.buffer_trimming == "sentence":
207
+
208
  tokenizer = create_tokenizer(tgt_language)
209
  else:
210
  tokenizer = None