BPE / app.py
sagar007's picture
Update app.py
00ea787 verified
raw
history blame
2.9 kB
import gradio as gr
import re
from collections import Counter
def preprocess_text(text):
    """Keep only Devanagari characters and normalise the spacing.

    Every character that is neither whitespace nor inside the Unicode range
    U+0900-U+097F is removed. The surviving whitespace-separated pieces are
    then re-joined with single spaces, which also trims both ends.
    """
    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    pieces = devanagari_only.split()
    return ' '.join(pieces)
def get_stats(vocab):
    """Tally adjacent symbol pairs across the vocabulary.

    Each key of ``vocab`` is split on whitespace into symbols; every pair of
    neighbouring symbols has the key's frequency added to its tally.

    Returns a ``Counter`` keyed by ``(left, right)`` tuples.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its immediate successor.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts
def merge_vocab(pair, v_in):
    """Fuse every occurrence of ``pair`` in the vocabulary into one symbol.

    Args:
        pair: Two symbols ``(left, right)`` to be merged.
        v_in: Mapping from space-separated symbol strings to frequencies.

    Returns:
        A new ``dict`` with the same frequencies, keyed by the rewritten
        symbol strings. ``v_in`` itself is not modified.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # The lookarounds pin the match to whole symbols. A plain substring
    # replace would also rewrite 'xa b' for the pair ('a', 'b'), gluing the
    # tail of one symbol to the next.
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps any backslash in the symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Two different keys can collapse to the same string after the merge;
        # add their frequencies instead of letting the last one win.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
def apply_bpe(text, bpe_codes):
    """Segment ``text`` into tokens by replaying learned merges in order.

    Each whitespace-separated word is first broken into its individual
    characters. Every merge in ``bpe_codes`` is then applied, in the order
    given, wherever its two symbols sit next to each other inside a word.

    Args:
        text: Whitespace-separated words to encode.
        bpe_codes: Iterable of ``(pair, count)`` entries, where ``pair`` holds
            the two symbols to merge. The count is ignored here.

    Returns:
        A flat list of the resulting tokens, in reading order.
    """
    # Represent each word as space-separated symbols so a merge is a textual
    # substitution. Without this split the words contain no spaces and no
    # merge pattern can ever match.
    segmented = [' '.join(word) for word in text.split()]

    for pair, _ in bpe_codes:
        merged = ''.join(pair)
        # Lookarounds restrict the match to whole symbols, so ('a', 'b') does
        # not fire on the tail of 'aa' followed by 'b'.
        pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
        # A callable replacement keeps any backslash in the symbols literal.
        segmented = [pattern.sub(lambda _match, merged=merged: merged, word)
                     for word in segmented]

    return [token for word in segmented for token in word.split()]
def bpe_process(input_text, target_vocab_size):
    """Run the BPE pipeline on ``input_text`` and report summary figures.

    Args:
        input_text: Raw text; it is passed through ``preprocess_text`` before
            any other step.
        target_vocab_size: Merges are attempted only while the vocabulary has
            fewer entries than this value.

    Returns:
        A 4-tuple ``(encoded, vocab_size, compression_ratio, criteria_met)``:
        the encoded entries joined by single spaces, the number of vocabulary
        entries after merging, the word count divided by the number of
        encoded entries (``0.0`` when nothing was encoded), and a dict with
        the boolean keys ``"vocab_size_met"`` and ``"compression_ratio_met"``.
    """
    preprocessed_text = preprocess_text(input_text)
    words = preprocessed_text.split()

    # Initialize vocabulary: whole words plus every 2- and 3-character window
    # of the cleaned text (windows may contain a space).
    vocab = Counter(words)
    vocab.update(preprocessed_text[i:i + 2] for i in range(len(preprocessed_text) - 1))
    vocab.update(preprocessed_text[i:i + 3] for i in range(len(preprocessed_text) - 2))

    # Perform BPE merges
    # NOTE(review): a merge rewrites existing keys and never adds new ones, so
    # len(vocab) cannot grow inside this loop; in practice it ends when no
    # adjacent pairs remain. Confirm that this is the intended stop condition.
    bpe_codes = []
    while len(vocab) < target_vocab_size:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

    # Apply BPE to the original text
    encoded_text = apply_bpe(preprocessed_text, bpe_codes)

    # Calculate compression ratio
    # NOTE(review): this is words per encoded entry, not characters per token;
    # verify that this is the metric the ">= 3" criterion is meant to check.
    original_size = len(words)
    compressed_size = len(encoded_text)
    # Empty input, or input without any Devanagari characters, yields nothing
    # to encode; report 0.0 instead of raising ZeroDivisionError.
    compression_ratio = original_size / compressed_size if compressed_size else 0.0

    # Check if criteria are met
    criteria_met = {
        "vocab_size_met": len(vocab) >= 5000,
        "compression_ratio_met": compression_ratio >= 3
    }

    return (
        " ".join(encoded_text),
        len(vocab),
        compression_ratio,
        criteria_met
    )
# Input widgets: the text to encode and the vocabulary-size target.
input_components = [
    gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
    gr.Slider(minimum=1000, maximum=10000, step=100, value=6000, label="Target Vocabulary Size"),
]

# Output widgets, in the same order as the tuple returned by bpe_process.
output_components = [
    gr.Textbox(label="Encoded Text"),
    gr.Number(label="Vocabulary Size"),
    gr.Number(label="Compression Ratio"),
    gr.JSON(label="Criteria Met"),
]

# Wire the widgets to the BPE pipeline.
iface = gr.Interface(
    fn=bpe_process,
    inputs=input_components,
    outputs=output_components,
    title="Byte Pair Encoding (BPE) for Hindi",
    description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio.",
)

# Start the app and request a public share link.
iface.launch(share=True)