Snapshot
- app.py +3 -3
- text_processing.py +13 -11
app.py CHANGED

@@ -21,7 +21,7 @@ def tokenize(input_text: str, tokenizer: Tokenizer, device: torch.device) -> tup
     attention_mask = cast(torch.Tensor, inputs["attention_mask"])
     return input_ids, attention_mask
 
-def calculate_log_probabilities(model: PreTrainedModel, tokenizer: Tokenizer, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> list[tuple[str, float]]:
+def calculate_log_probabilities(model: PreTrainedModel, tokenizer: Tokenizer, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> list[tuple[int, float]]:
     with torch.no_grad():
         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
     # B x T x V
@@ -31,8 +31,8 @@ def calculate_log_probabilities(model: PreTrainedModel, tokenizer: Tokenizer, in
     # T - 1
     token_log_probs: torch.Tensor = log_probs[0, range(log_probs.shape[1]), input_ids[0][1:]]
     # T - 1
-    tokens:
-    return list(zip(tokens, token_log_probs.tolist()))
+    tokens: torch.Tensor = input_ids[0][1:]
+    return list(zip(tokens.tolist(), token_log_probs.tolist()))
 
 
 def generate_replacements(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix_tokens: list[int], device: torch.device, num_samples: int = 5) -> list[str]:
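The hunks above show only part of calculate_log_probabilities, so here is a minimal, self-contained sketch of the shifted-indexing and pairing logic it ends with. The toy vocabulary size, the random logits standing in for the model output, and the [:, :-1, :] slice that yields the T - 1 positions are assumptions made for the example, not lines taken from app.py.

import torch

torch.manual_seed(0)

vocab_size = 10                                           # toy vocabulary (assumption)
input_ids = torch.tensor([[2, 5, 7, 1]])                  # B x T, here 1 x 4
logits = torch.randn(1, input_ids.shape[1], vocab_size)   # stand-in for outputs.logits, B x T x V

# Position t predicts token t + 1, so drop the last position: B x (T-1) x V.
log_probs = torch.log_softmax(logits, dim=-1)[:, :-1, :]

# For each position, pick the log-prob assigned to the token that actually followed it.  # T - 1
token_log_probs = log_probs[0, range(log_probs.shape[1]), input_ids[0][1:]]

# After this change the function pairs token ids (not decoded strings) with their
# log-probs, matching the new list[tuple[int, float]] return annotation.
tokens = input_ids[0][1:]
pairs = list(zip(tokens.tolist(), token_log_probs.tolist()))
print(pairs)  # [(5, ...), (7, ...), (1, ...)] with float log-probs

Returning ids instead of strings is what lets the updated split_into_words in text_processing.py decide for itself how to decode single tokens versus whole words.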
text_processing.py CHANGED

@@ -1,30 +1,32 @@
 from dataclasses import dataclass
+from tokenizers import Tokenizer
 
 @dataclass
 class Word:
-    tokens: list[str]
+    tokens: list[int]
     text: str
     logprob: float
     first_token_index: int
 
-def split_into_words(token_probs: list[tuple[str, float]]) -> list[Word]:
-    words = []
-    current_word = []
-    current_log_probs = []
-    current_word_first_token_index = 0
+def split_into_words(token_probs: list[tuple[int, float]], tokenizer: Tokenizer) -> list[Word]:
+    words: list[Word] = []
+    current_word: list[int] = []
+    current_log_probs: list[float] = []
+    current_word_first_token_index: int = 0
 
-    for i, (token, logprob) in enumerate(token_probs):
+    for i, (token_id, logprob) in enumerate(token_probs):
+        token: str = tokenizer.decode([token_id])
         if not token.startswith(chr(9601)) and token.isalpha():
-            current_word.append(token)
+            current_word.append(token_id)
             current_log_probs.append(logprob)
         else:
             if current_word:
-                words.append(Word(current_word,
-            current_word = [token]
+                words.append(Word(current_word, tokenizer.decode(current_word), sum(current_log_probs), current_word_first_token_index))
+            current_word = [token_id]
             current_log_probs = [logprob]
             current_word_first_token_index = i
 
     if current_word:
-        words.append(Word(current_word,
+        words.append(Word(current_word, tokenizer.decode(current_word), sum(current_log_probs), current_word_first_token_index))
 
     return words
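As a rough usage sketch of the new id-based interface, the snippet below feeds (token_id, logprob) pairs, shaped like the output of calculate_log_probabilities, through split_into_words. The tiny stand-in tokenizer (a fixed id-to-piece table with a decode() method, using chr(9601), i.e. "\u2581", as the word-start marker) is an assumption so the example runs without loading a real model; in the app a tokenizers.Tokenizer instance is passed instead.

from text_processing import split_into_words

class StubTokenizer:
    # Hypothetical SentencePiece-style pieces; chr(9601) == "\u2581" marks a word start.
    PIECES = {0: "\u2581The", 1: "\u2581quick", 2: "\u2581fo", 3: "x", 4: "."}

    def decode(self, ids: list[int]) -> str:
        return "".join(self.PIECES[i] for i in ids)

# (token_id, logprob) pairs, as calculate_log_probabilities now returns them.
token_probs = [(0, -0.5), (1, -1.2), (2, -0.8), (3, -0.3), (4, -2.0)]

for word in split_into_words(token_probs, StubTokenizer()):
    print(word.text, word.tokens, word.logprob, word.first_token_index)

# "fo" and "x" end up in one Word because "x" is alphabetic and has no "\u2581"
# prefix; "." is non-alphabetic, so it closes the current word and starts its own.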