RangiLyu committed
Commit 7134699 · verified · Parent: 5d5ba41

better fix out of vocab tokens (#6)


- better fix out of vocab tokens (a0a7085477e391fd28ba330c12379c5ebe955c09)

Files changed (1)
  1. tokenization_interns1.py +5 -3
tokenization_interns1.py CHANGED
@@ -891,11 +891,13 @@ class InternS1Tokenizer(Qwen2Tokenizer):
         else:
             return self.encoder.get(token, self.encoder.get(self._unk_token))
 
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index, "")
+
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
-        text = ""
-        for token in tokens:
-            text += token if token else ""
+        text = "".join(tokens)
         text = text.replace(
             "▁", "Ġ"
         )  # This discrepancy stems from differing whitespace treatment in SentencePiece versus BPE tokenization.
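
For context, here is a minimal standalone sketch of why the patch works. The toy vocabulary and the ids used below are hypothetical, and the two functions are simplified stand-ins for the patched methods, not the full tokenizer. Previously, an out-of-vocab id could surface as None from the decoder lookup, which the old element-by-element loop had to filter; the patch instead maps out-of-vocab ids to "" inside _convert_id_to_token, so a plain "".join(tokens) is safe.

# Minimal sketch of the patched behavior with a hypothetical toy vocab.
decoder = {0: "Hello", 1: "Ġworld"}  # id -> token; the entries are illustrative

def _convert_id_to_token(index):
    # Out-of-vocab ids yield "" instead of None, so downstream
    # string operations never see a None entry.
    return decoder.get(index, "")

def convert_tokens_to_string(tokens):
    # With None entries impossible, a plain join replaces the old loop.
    text = "".join(tokens)
    return text.replace("▁", "Ġ")

tokens = [_convert_id_to_token(i) for i in (0, 999, 1)]  # 999 is out of vocab
print(convert_tokens_to_string(tokens))  # -> "HelloĠworld"

With the old code, "".join would raise a TypeError on the None produced by an unmapped id; returning "" at the source fixes this once rather than guarding every consumer.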