sagar007 committed on
Commit
20f38a4
·
verified ·
1 Parent(s): a1bcbda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -10
app.py CHANGED
@@ -22,29 +22,27 @@ def merge_vocab(pair, v_in):
22
  bigram = ' '.join(pair)
23
  replacement = ''.join(pair)
24
  for word in v_in:
25
- w_out = word.replace(bigram, replacement)
 
26
  v_out[w_out] = v_in[word]
27
  return v_out
28
 
29
  def apply_bpe(text, bpe_codes):
30
  word_list = text.split()
31
  for pair, _ in bpe_codes:
32
- if ' ' in pair:
33
- p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
34
- word_list = [p.sub(''.join(pair), word) for word in word_list]
35
  return ' '.join(word_list)
36
 
37
  def bpe_process(input_text, target_vocab_size):
38
  preprocessed_text = preprocess_text(input_text)
39
 
40
- # Initialize vocabulary with character-level tokens and common subwords
41
  vocab = Counter(preprocessed_text.split())
42
- vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
43
- vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
44
 
45
  # Perform BPE merges
46
  bpe_codes = []
47
- while len(vocab) < target_vocab_size:
48
  pairs = get_stats(vocab)
49
  if not pairs:
50
  break
@@ -56,8 +54,8 @@ def bpe_process(input_text, target_vocab_size):
56
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
57
 
58
  # Calculate compression ratio
59
- original_size = len(preprocessed_text)
60
- compressed_size = len(encoded_text)
61
  compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
62
 
63
  # Check if criteria are met
 
22
  bigram = ' '.join(pair)
23
  replacement = ''.join(pair)
24
  for word in v_in:
25
+ # Use regex to ensure whole-word replacement
26
+ w_out = re.sub(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)', replacement, word)
27
  v_out[w_out] = v_in[word]
28
  return v_out
29
 
30
  def apply_bpe(text, bpe_codes):
31
  word_list = text.split()
32
  for pair, _ in bpe_codes:
33
+ p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
34
+ word_list = [p.sub(''.join(pair), word) for word in word_list]
 
35
  return ' '.join(word_list)
36
 
37
  def bpe_process(input_text, target_vocab_size):
38
  preprocessed_text = preprocess_text(input_text)
39
 
40
+ # Initialize vocabulary
41
  vocab = Counter(preprocessed_text.split())
 
 
42
 
43
  # Perform BPE merges
44
  bpe_codes = []
45
+ while len(vocab) < target_vocab_size and len(vocab) > 1:
46
  pairs = get_stats(vocab)
47
  if not pairs:
48
  break
 
54
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
55
 
56
  # Calculate compression ratio
57
+ original_size = len(preprocessed_text.split())
58
+ compressed_size = len(encoded_text.split())
59
  compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
60
 
61
  # Check if criteria are met