Normalize Polish text to NFC before tokenization
Browse files
src/chatterbox/models/tokenizers/tokenizer.py
CHANGED
|
@@ -306,6 +306,10 @@ class MTLTokenizer:
|
|
| 306 |
txt = korean_normalize(txt)
|
| 307 |
elif language_id == 'ru':
|
| 308 |
txt = self.russian_stress_labeler(txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
# Prepend language token
|
| 311 |
if language_id:
|
|
|
|
| 306 |
txt = korean_normalize(txt)
|
| 307 |
elif language_id == 'ru':
|
| 308 |
txt = self.russian_stress_labeler(txt)
|
| 309 |
+
elif language_id == 'pl':
|
| 310 |
+
# Polish text normalization: compose to NFC so decomposed diacritics (e.g. 'a' + U+0328 combining ogonek) become single precomposed code points (e.g. 'ą')
|
| 311 |
+
import unicodedata
|
| 312 |
+
txt = unicodedata.normalize('NFC', txt)
|
| 313 |
|
| 314 |
# Prepend language token
|
| 315 |
if language_id:
|