# BPE / app.py
# sagar007's picture
# Update app.py
# f73fa89 verified
import re
from collections import Counter
import gradio as gr
def preprocess_text(text):
    """Reduce *text* to Devanagari-block characters separated by single spaces.

    Every character outside U+0900-U+097F that is not whitespace is removed,
    then runs of whitespace are collapsed to one space and leading/trailing
    whitespace is dropped.
    """
    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # split() with no argument discards empty pieces, which is what collapses
    # whitespace runs and trims both ends.
    pieces = devanagari_only.split()
    return ' '.join(pieces)
def get_stats(vocab):
    """Count adjacent symbol pairs over a frequency-weighted vocabulary.

    Each key of *vocab* is split on whitespace into symbols; every pair of
    neighbouring symbols is credited with that key's frequency. Returns a
    Counter keyed by ``(left, right)`` tuples.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        parts = entry.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(parts, parts[1:]):
            pair_counts[left, right] += count
    return pair_counts
def merge_vocab(pair, v_in):
    """Return a copy of *v_in* with every occurrence of *pair* fused.

    Args:
        pair: Tuple of symbols to merge, e.g. ``("a", "b")``.
        v_in: Mapping from a space-separated symbol string to its frequency.

    Returns:
        A new dict in which each ``"a b"`` occurrence that lines up with
        whole symbols has become ``"ab"``. Entries that end up identical
        after merging have their frequencies added together.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # Anchor on whitespace/edges so the match covers whole symbols only;
    # a plain str.replace would also rewrite "xa b" into "xab".
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps backslashes in symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Two entries may collapse to the same key; keep both frequencies
        # instead of letting the later one overwrite the earlier one.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
def _merge_adjacent(symbols, pair):
    """Return *symbols* with each left-to-right occurrence of *pair* fused."""
    first, second = pair
    merged = []
    i = 0
    last = len(symbols) - 1
    while i <= last:
        if i < last and symbols[i] == first and symbols[i + 1] == second:
            merged.append(first + second)
            i += 2
        else:
            merged.append(symbols[i])
            i += 1
    return merged


def apply_bpe(text, bpe_codes):
    """Encode *text* by applying the learned merges to each word.

    Args:
        text: Whitespace-separated words.
        bpe_codes: Sequence of ``(pair, frequency)`` entries in the order the
            merges were learned; only ``pair`` is used here.

    Returns:
        A list with one string per input word. Each string holds that word's
        symbols separated by single spaces, starting from individual
        characters and fusing pairs in merge order.
    """
    words = text.split()
    encoded = {}
    for word in words:
        if word in encoded:
            # Repeated words encode identically; reuse the earlier result.
            continue
        symbols = list(word)
        for pair, _ in bpe_codes:
            if len(symbols) < 2:
                break  # nothing left to merge in this word
            symbols = _merge_adjacent(symbols, pair)
        encoded[word] = ' '.join(symbols)
    return [encoded[word] for word in words]
def perform_bpe(text, min_vocab_size=5000, min_compression_ratio=3):
    """Learn BPE merges on *text* and report size statistics.

    The text is cleaned with ``preprocess_text``; a vocabulary is built from
    its words plus every 2- and 3-character window; then the most frequent
    symbol pair is merged repeatedly until no pair is left or both thresholds
    are reached.

    Args:
        text: Raw input text.
        min_vocab_size: Vocabulary-size threshold (number of vocabulary
            entries) used by the stopping rule and the verdict line.
        min_compression_ratio: Compression-ratio threshold used by the
            stopping rule and the verdict line.

    Returns:
        ``(report, encoded)`` where ``report`` is a multi-line summary and
        ``encoded`` is the encoded text joined with single spaces.
    """
    preprocessed_text = preprocess_text(text)
    original_size = len(preprocessed_text)

    # NOTE(review): the vocabulary mixes whole words with character 2-/3-grams.
    # get_stats splits entries on whitespace, so only entries that contain an
    # internal space can contribute pairs -- confirm this is intended.
    vocab = Counter(preprocessed_text.split())
    vocab.update(preprocessed_text[i:i + 2] for i in range(len(preprocessed_text) - 1))
    vocab.update(preprocessed_text[i:i + 3] for i in range(len(preprocessed_text) - 2))

    bpe_codes = []
    # Measure the zero-merge encoding first so the report below is defined
    # even when the loop exits before learning a single merge (for example a
    # one-word input, or input with no Devanagari characters).
    encoded_text = apply_bpe(preprocessed_text, bpe_codes)
    compressed_size = len(encoded_text)
    compression_ratio = original_size / compressed_size if compressed_size > 0 else 0

    while True:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

        encoded_text = apply_bpe(preprocessed_text, bpe_codes)
        compressed_size = len(encoded_text)
        compression_ratio = original_size / compressed_size if compressed_size > 0 else 0
        if len(vocab) >= min_vocab_size and compression_ratio >= min_compression_ratio:
            break

    vocab_ok = len(vocab) >= min_vocab_size
    ratio_ok = compression_ratio >= min_compression_ratio
    if vocab_ok and ratio_ok:
        verdict = "Both criteria are met!"
    elif vocab_ok:
        verdict = (
            "Vocabulary size criterion is met, but compression ratio is "
            f"below {min_compression_ratio}."
        )
    elif ratio_ok:
        verdict = (
            "Compression ratio criterion is met, but vocabulary size is "
            f"below {min_vocab_size}."
        )
    else:
        verdict = "Neither criterion is met."

    result = (
        f"Vocabulary size: {len(vocab)}\n"
        f"Original size: {original_size}\n"
        f"Compressed size: {compressed_size}\n"
        f"Compression ratio: {compression_ratio:.2f}X\n\n"
        f"{verdict}"
    )
    return result, ' '.join(encoded_text)
def bpe_app(input_text):
    """Gradio callback: run BPE on *input_text* and return (stats, encoded).

    A falsy input (empty string or None) is replaced by a built-in Hindi
    example sentence before processing.
    """
    source_text = input_text if input_text else "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
    summary, encoded = perform_bpe(source_text)
    return summary, encoded
# Custom CSS passed to gr.Blocks(css=...). That parameter takes raw CSS text,
# so the rules are not wrapped in <style> tags: the tags are not valid CSS and
# would corrupt the first rule (`body`).
custom_css = """
body {
font-family: 'Poppins', sans-serif;
background: linear-gradient(135deg, #1e3c72, #2a5298);
color: #ffffff;
}
.container {
max-width: 900px;
margin: 0 auto;
padding: 20px;
}
.gradio-container {
background-color: rgba(255, 255, 255, 0.1);
border-radius: 15px;
backdrop-filter: blur(10px);
box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
}
h1 {
color: #ffffff;
text-align: center;
font-size: 2.5em;
margin-bottom: 20px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.gr-button {
background-color: #4CAF50 !important;
border: none !important;
color: white !important;
text-transform: uppercase;
font-weight: bold;
transition: all 0.3s ease;
}
.gr-button:hover {
background-color: #45a049 !important;
box-shadow: 0 5px 15px rgba(0,0,0,0.2);
transform: translateY(-2px);
}
.gr-form {
border-radius: 10px;
background-color: rgba(255, 255, 255, 0.1);
padding: 20px;
margin-bottom: 20px;
}
.gr-box {
border-radius: 8px;
border: 1px solid rgba(255, 255, 255, 0.2);
padding: 15px;
margin-top: 10px;
background-color: rgba(255, 255, 255, 0.05);
}
.gr-padded {
padding: 15px;
}
.gr-input, .gr-textarea {
background-color: rgba(255, 255, 255, 0.1) !important;
border: 2px solid rgba(255, 255, 255, 0.2) !important;
border-radius: 8px !important;
color: white !important;
}
.gr-input:focus, .gr-textarea:focus {
border-color: #4CAF50 !important;
box-shadow: 0 0 0 2px rgba(76, 175, 80, 0.2) !important;
}
#component-0 {
border-radius: 10px;
overflow: hidden;
}
.footer {
text-align: center;
margin-top: 20px;
font-size: 0.9em;
color: rgba(255, 255, 255, 0.7);
}
"""
# HTML template for the page header: an <h1> title and a short centred
# description paragraph, wrapped in a "container" div. It is handed to
# gr.HTML when the interface is built.
html_template = """
<div class="container">
<h1>🇮🇳 Byte Pair Encoding for Hindi</h1>
<p style="text-align: center; margin-bottom: 20px; color: rgba(255, 255, 255, 0.8);">
Compress and tokenize Hindi text using the BPE algorithm.
Enter your text below or use one of the examples provided.
</p>
</div>
"""
# Create Gradio interface with custom theme
with gr.Blocks(css=custom_css) as iface:
    # Page header (title and description).
    gr.HTML(html_template)
    # Input row: multi-line textbox for the Hindi text to encode.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=5, label="Input Hindi Text", placeholder="Enter Hindi text here or leave blank for an example")
    # Action row: the button that triggers processing.
    with gr.Row():
        submit_btn = gr.Button("Process", variant="primary")
    # Output row: statistics on the left, encoded text on the right.
    with gr.Row():
        with gr.Column():
            output_stats = gr.Textbox(label="BPE Statistics")
        with gr.Column():
            output_encoded = gr.Textbox(label="Encoded Text")
    gr.Markdown("The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above.")
    # Clickable sample sentences that fill the input textbox.
    examples = gr.Examples(
        examples=[
            ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
            ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
            ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"]
        ],
        inputs=[input_text],
    )
    # Wire the button: bpe_app receives the textbox value and its two return
    # values populate the statistics and encoded-text boxes, in that order.
    submit_btn.click(bpe_app, inputs=[input_text], outputs=[output_stats, output_encoded])
    gr.HTML('<div class="footer">Developed with ❤️ for Hindi language processing</div>')
# Launch the app
# inbrowser=True asks Gradio to open the app in a browser tab; share=True
# requests a public share link. This runs at import time (no __main__ guard).
iface.launch(inbrowser=True, share=True)