|
|
import gradio as gr |
|
|
import re |
|
|
from collections import Counter |
|
|
|
|
|
def preprocess_text(text):
    """Keep only Devanagari characters and normalise whitespace.

    Every character that is neither in the Unicode range U+0900-U+097F nor
    whitespace is deleted. Runs of whitespace are then collapsed to single
    spaces and leading/trailing whitespace is dropped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing in the allowed range remains.
    """
    disallowed = re.compile(r'[^\u0900-\u097F\s]')
    cleaned = disallowed.sub('', text)
    # split() with no argument discards empty fields, so joining on a single
    # space both trims the ends and squeezes interior whitespace runs.
    tokens = cleaned.split()
    return ' '.join(tokens)
|
|
|
|
|
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping whose keys are strings of whitespace-separated symbols
            and whose values are the frequencies of those keys.

    Returns:
        A ``Counter`` mapping ``(left_symbol, right_symbol)`` tuples to the
        summed frequency of every key in which the two symbols are adjacent.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pairing the sequence with itself shifted by one yields each
        # neighbouring (left, right) pair in left-to-right order.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts
|
|
|
|
|
def merge_vocab(pair, v_in):
    """Merge every occurrence of ``pair`` in the vocabulary into one symbol.

    Args:
        pair: Sequence of symbols (normally two) to fuse together.
        v_in: Mapping from strings of space-separated symbols to frequencies.

    Returns:
        A new ``dict`` with the same frequencies, keyed by the strings after
        the merge. ``v_in`` is not modified. If several input keys become the
        same string after merging, their frequencies are added together.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # The lookarounds anchor the match to whole symbols. A plain substring
    # replace would also fire inside longer symbols, e.g. pair ('a', 'b')
    # would wrongly turn 'ca b' into 'cab'.
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps backslashes in the merged symbol from
        # being interpreted as regex escapes.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate rather than overwrite so colliding keys keep their
        # combined count.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
|
|
|
|
|
def apply_bpe(text, bpe_codes):
    """Segment ``text`` into subword tokens using learned BPE merges.

    Each whitespace-separated word starts as a sequence of single characters.
    The merges are then applied in the order given; a merge fuses every
    adjacent occurrence of its two symbols, scanning left to right without
    overlap.

    The previous implementation never split words into symbols and guarded
    the merge with ``' ' in pair``, which is a tuple-membership test that is
    never true for symbols produced by ``str.split``. It therefore returned
    the input words unchanged.

    Args:
        text: Whitespace-separated words to encode.
        bpe_codes: Iterable of ``(pair, frequency)`` items in learned order,
            where ``pair`` is a two-element sequence of symbols. The
            frequency is ignored.

    Returns:
        A flat list of tokens for the whole text, in reading order. Word
        boundaries are not marked in the output.
    """
    merges = [tuple(pair) for pair, _ in bpe_codes]
    segmented = {}  # word -> token list; repeated words are encoded once
    tokens = []

    for word in text.split():
        if word not in segmented:
            symbols = list(word)
            for first, second in merges:
                if len(symbols) < 2:
                    # A single symbol cannot take part in any further merge.
                    break
                fused = []
                i = 0
                while i < len(symbols):
                    if (
                        i + 1 < len(symbols)
                        and symbols[i] == first
                        and symbols[i + 1] == second
                    ):
                        fused.append(first + second)
                        i += 2
                    else:
                        fused.append(symbols[i])
                        i += 1
                symbols = fused
            segmented[word] = symbols
        tokens.extend(segmented[word])

    return tokens
|
|
|
|
|
def bpe_process(input_text, target_vocab_size):
    """Learn BPE merges from the input text, encode it, and report statistics.

    The text is cleaned with ``preprocess_text`` and each distinct word is
    stored as a string of space-separated characters, which is the key format
    ``get_stats`` and ``merge_vocab`` operate on. The most frequent adjacent
    pair is merged repeatedly until the symbol inventory reaches
    ``target_vocab_size`` or no pairs remain.

    Fixes relative to the previous version:
      * The vocabulary used to hold whole words plus raw character 2/3-grams
        of the text, so the only pairs ever found were those straddling a
        space.
      * The loop compared ``len(vocab)`` with the target, but merging never
        grows that dict. The symbol inventory is now tracked instead.
      * The compression ratio divided the word count by a token count that
        could be zero, raising ``ZeroDivisionError`` on empty input.

    Args:
        input_text: Raw text to encode; ``None`` is treated as empty.
        target_vocab_size: Upper bound on the symbol inventory (distinct
            characters plus merged symbols) at which learning stops.

    Returns:
        A 4-tuple ``(encoded, vocab_size, compression_ratio, criteria_met)``:
        the tokens joined by spaces, the number of distinct symbols, the
        number of non-space characters per emitted token (``0.0`` when there
        are no tokens), and a dict with the boolean keys ``"vocab_size_met"``
        and ``"compression_ratio_met"``.
    """
    preprocessed_text = preprocess_text(input_text or "")
    words = preprocessed_text.split()

    # Word-frequency table keyed by space-separated characters.
    vocab = {" ".join(word): freq for word, freq in Counter(words).items()}

    # The inventory starts as the distinct characters; each merge can add one.
    symbols = set("".join(words))

    bpe_codes = []
    while len(symbols) < target_vocab_size:
        pairs = get_stats(vocab)
        if not pairs:
            # Every word is already a single symbol; nothing left to merge.
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))
        symbols.add("".join(best))

    encoded_text = apply_bpe(preprocessed_text, bpe_codes)

    # Size before encoding is counted in characters (spaces excluded), size
    # after encoding in tokens.
    original_size = sum(len(word) for word in words)
    compressed_size = len(encoded_text)
    compression_ratio = original_size / compressed_size if compressed_size else 0.0

    vocab_size = len(symbols)
    criteria_met = {
        "vocab_size_met": vocab_size >= 5000,
        "compression_ratio_met": compression_ratio >= 3,
    }

    return (
        " ".join(encoded_text),
        vocab_size,
        compression_ratio,
        criteria_met,
    )
|
|
|
|
|
|
|
|
# Input widgets, listed in the positional order of bpe_process's parameters
# (input_text, target_vocab_size).
input_components = [
    gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
    gr.Slider(minimum=1000, maximum=10000, step=100, value=6000, label="Target Vocabulary Size"),
]

# Output widgets, one per element of the tuple returned by bpe_process.
output_components = [
    gr.Textbox(label="Encoded Text"),
    gr.Number(label="Vocabulary Size"),
    gr.Number(label="Compression Ratio"),
    gr.JSON(label="Criteria Met"),
]

# Web UI that wires the widgets above to bpe_process.
iface = gr.Interface(
    fn=bpe_process,
    inputs=input_components,
    outputs=output_components,
    title="Byte Pair Encoding (BPE) for Hindi",
    description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio.",
)

# Runs when the module is executed or imported.
# NOTE(review): share=True requests a public Gradio share link - confirm
# that exposing this demo publicly is intended.
iface.launch(share=True)