Update app.py
Browse files
app.py
CHANGED
|
@@ -22,29 +22,27 @@ def merge_vocab(pair, v_in):
|
|
| 22 |
bigram = ' '.join(pair)
|
| 23 |
replacement = ''.join(pair)
|
| 24 |
for word in v_in:
|
| 25 |
-
|
|
|
|
| 26 |
v_out[w_out] = v_in[word]
|
| 27 |
return v_out
|
| 28 |
|
| 29 |
def apply_bpe(text, bpe_codes):
|
| 30 |
word_list = text.split()
|
| 31 |
for pair, _ in bpe_codes:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
word_list = [p.sub(''.join(pair), word) for word in word_list]
|
| 35 |
return ' '.join(word_list)
|
| 36 |
|
| 37 |
def bpe_process(input_text, target_vocab_size):
|
| 38 |
preprocessed_text = preprocess_text(input_text)
|
| 39 |
|
| 40 |
-
# Initialize vocabulary
|
| 41 |
vocab = Counter(preprocessed_text.split())
|
| 42 |
-
vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
|
| 43 |
-
vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
|
| 44 |
|
| 45 |
# Perform BPE merges
|
| 46 |
bpe_codes = []
|
| 47 |
-
while len(vocab) < target_vocab_size:
|
| 48 |
pairs = get_stats(vocab)
|
| 49 |
if not pairs:
|
| 50 |
break
|
|
@@ -56,8 +54,8 @@ def bpe_process(input_text, target_vocab_size):
|
|
| 56 |
encoded_text = apply_bpe(preprocessed_text, bpe_codes)
|
| 57 |
|
| 58 |
# Calculate compression ratio
|
| 59 |
-
original_size = len(preprocessed_text)
|
| 60 |
-
compressed_size = len(encoded_text)
|
| 61 |
compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
|
| 62 |
|
| 63 |
# Check if criteria are met
|
|
|
|
| 22 |
bigram = ' '.join(pair)
|
| 23 |
replacement = ''.join(pair)
|
| 24 |
for word in v_in:
|
| 25 |
+
# Use regex to ensure whole-word replacement
|
| 26 |
+
w_out = re.sub(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)', replacement, word)
|
| 27 |
v_out[w_out] = v_in[word]
|
| 28 |
return v_out
|
| 29 |
|
| 30 |
def apply_bpe(text, bpe_codes):
    """Apply learned BPE merges to the whitespace-separated tokens of *text*.

    Args:
        text: Input string whose tokens are separated by whitespace.
        bpe_codes: Iterable of ``(pair, count)`` entries in merge order, where
            ``pair`` is a sequence of adjacent tokens to fuse into one. The
            count is not used here.

    Returns:
        The tokens joined by single spaces, with every standalone occurrence
        of each pair fused into a single token.
    """
    # Normalise whitespace once so adjacent tokens are separated by exactly
    # one space, which is what the merge patterns are built to match.
    encoded = ' '.join(text.split())
    for pair, _ in bpe_codes:
        merged = ''.join(pair)
        # The lookarounds anchor the match on token boundaries, so a pair is
        # never matched inside a longer token.
        pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        # Substitute over the whole sequence: matching token by token cannot
        # succeed because a single split() token contains no space. A callable
        # replacement keeps any backslash in the merged token literal.
        encoded = pattern.sub(lambda _match: merged, encoded)
    return encoded
|
| 36 |
|
| 37 |
def bpe_process(input_text, target_vocab_size):
|
| 38 |
preprocessed_text = preprocess_text(input_text)
|
| 39 |
|
| 40 |
+
# Initialize vocabulary
|
| 41 |
vocab = Counter(preprocessed_text.split())
|
|
|
|
|
|
|
| 42 |
|
| 43 |
# Perform BPE merges
|
| 44 |
bpe_codes = []
|
| 45 |
+
while len(vocab) < target_vocab_size and len(vocab) > 1:
|
| 46 |
pairs = get_stats(vocab)
|
| 47 |
if not pairs:
|
| 48 |
break
|
|
|
|
| 54 |
encoded_text = apply_bpe(preprocessed_text, bpe_codes)
|
| 55 |
|
| 56 |
# Calculate compression ratio
|
| 57 |
+
original_size = len(preprocessed_text.split())
|
| 58 |
+
compressed_size = len(encoded_text.split())
|
| 59 |
compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
|
| 60 |
|
| 61 |
# Check if criteria are met
|