|
|
import re |
|
|
from collections import Counter |
|
|
import gradio as gr |
|
|
|
|
|
def preprocess_text(text):
    """Strip non-Devanagari characters and normalise whitespace.

    Any character that is neither whitespace nor inside the Unicode range
    U+0900 to U+097F is deleted. Remaining runs of whitespace are collapsed
    to one space and leading/trailing whitespace is dropped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    tokens = devanagari_only.split()
    return ' '.join(tokens)
|
|
|
|
|
def get_stats(vocab):
    """Tally how often each adjacent symbol pair occurs in the vocabulary.

    Args:
        vocab: Mapping from a whitespace-separated symbol string to its
            frequency.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` symbol tuples; each entry
        adds the frequency of every key in which that pair appears, once
        per occurrence.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        pieces = entry.split()
        # Walk neighbouring symbols; an entry with fewer than two symbols
        # contributes nothing.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts
|
|
|
|
|
def merge_vocab(pair, v_in):
    """Fuse every whole-symbol occurrence of *pair* in the vocabulary keys.

    Args:
        pair: Sequence of symbols to fuse (normally ``(left, right)``).
        v_in: Mapping from space-separated symbol strings to frequencies.

    Returns:
        A new dict in which each key has ``"left right"`` replaced by
        ``"leftright"``. If several input keys collapse to the same merged
        key, their frequencies are summed.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # The lookarounds anchor the match on symbol boundaries so that the
    # bigram is not matched inside longer symbols (e.g. pair ('a', 'b')
    # must not turn "xa b" into "xab").
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps any backslashes in the symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate instead of overwrite: two keys may merge to the same one.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
|
|
|
|
|
def apply_bpe(text, bpe_codes):
    """Apply learned merges, in order, to every word of *text*.

    Each whitespace-separated word is first broken into single characters
    joined by spaces; every merge in *bpe_codes* then fuses adjacent
    whole symbols that match its pair.

    Args:
        text: Input string; words are separated by whitespace.
        bpe_codes: Iterable of ``(pair, count)`` items where ``pair`` is a
            sequence of symbols. Only ``pair`` is used; ``count`` is ignored.

    Returns:
        A list with one entry per input word. Each entry is that word's
        symbols joined by single spaces (a fully merged word has no spaces).
    """
    # Merges operate on space-separated symbols, so start from characters.
    word_list = [' '.join(word) for word in text.split()]

    for pair, _ in bpe_codes:
        bigram = ' '.join(pair)
        merged = ''.join(pair)
        # Lookarounds restrict the match to whole symbols.
        pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')
        # Callable replacement avoids backslash-escape processing of `merged`.
        word_list = [
            pattern.sub(lambda _match, _merged=merged: _merged, word)
            for word in word_list
        ]

    return word_list
|
|
|
|
|
def perform_bpe(text):
    """Run the merge-learning loop on *text* and build a statistics report.

    The text is cleaned with ``preprocess_text``; a vocabulary is seeded and
    repeatedly merged on its most frequent symbol pair until no pair is left
    or both targets (vocabulary size >= 5000 and compression ratio >= 3) are
    reached.

    Args:
        text: Raw input text.

    Returns:
        A ``(report, encoded)`` tuple: ``report`` is a multi-line summary
        string and ``encoded`` is the output of ``apply_bpe`` joined with
        single spaces.
    """
    preprocessed_text = preprocess_text(text)
    original_size = len(preprocessed_text)

    # Seed the vocabulary with whole words plus every 2- and 3-character
    # window of the cleaned text.
    # NOTE(review): only keys containing an inner space yield pairs in
    # get_stats, so merges come from windows spanning a word boundary --
    # confirm this seeding is intended.
    vocab = Counter(preprocessed_text.split())
    vocab.update(
        preprocessed_text[i:i + 2] for i in range(len(preprocessed_text) - 1)
    )
    vocab.update(
        preprocessed_text[i:i + 3] for i in range(len(preprocessed_text) - 2)
    )

    bpe_codes = []

    # Baseline values so the report is well-defined even when the loop exits
    # before a single merge (e.g. one-word or empty input); without this the
    # names below would be unbound after the loop.
    encoded_text = apply_bpe(preprocessed_text, bpe_codes)
    compressed_size = len(encoded_text)
    compression_ratio = original_size / compressed_size if compressed_size > 0 else 0

    while True:
        pairs = get_stats(vocab)
        if not pairs:
            break

        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

        # NOTE(review): original_size is a character count while
        # compressed_size is the number of entries returned by apply_bpe;
        # confirm these are the intended units for the ratio.
        encoded_text = apply_bpe(preprocessed_text, bpe_codes)
        compressed_size = len(encoded_text)
        compression_ratio = original_size / compressed_size if compressed_size > 0 else 0

        if len(vocab) >= 5000 and compression_ratio >= 3:
            break

    vocab_ok = len(vocab) >= 5000
    ratio_ok = compression_ratio >= 3

    result = f"Vocabulary size: {len(vocab)}\n"
    result += f"Original size: {original_size}\n"
    result += f"Compressed size: {compressed_size}\n"
    result += f"Compression ratio: {compression_ratio:.2f}X\n\n"

    if vocab_ok and ratio_ok:
        result += "Both criteria are met!"
    elif vocab_ok:
        result += "Vocabulary size criterion is met, but compression ratio is below 3."
    elif ratio_ok:
        result += "Compression ratio criterion is met, but vocabulary size is below 5000."
    else:
        result += "Neither criterion is met."

    return result, ' '.join(encoded_text)
|
|
|
|
|
def bpe_app(input_text):
    """Callback for the UI: run BPE on the given text or on a built-in sample.

    Args:
        input_text: Text from the input box; a falsy value (empty string or
            ``None``) selects the built-in Hindi sample sentence.

    Returns:
        The ``(statistics, encoded_text)`` pair produced by ``perform_bpe``.
    """
    sample = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
    chosen = input_text if input_text else sample
    report, tokens = perform_bpe(chosen)
    return report, tokens
|
|
|
|
|
|
|
|
# Stylesheet passed to gr.Blocks(css=...). It must be raw CSS: a literal
# <style>...</style> wrapper is not CSS and would corrupt the first rule.
custom_css = """
body {
    font-family: 'Poppins', sans-serif;
    background: linear-gradient(135deg, #1e3c72, #2a5298);
    color: #ffffff;
}
.container {
    max-width: 900px;
    margin: 0 auto;
    padding: 20px;
}
.gradio-container {
    background-color: rgba(255, 255, 255, 0.1);
    border-radius: 15px;
    backdrop-filter: blur(10px);
    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
}
h1 {
    color: #ffffff;
    text-align: center;
    font-size: 2.5em;
    margin-bottom: 20px;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.gr-button {
    background-color: #4CAF50 !important;
    border: none !important;
    color: white !important;
    text-transform: uppercase;
    font-weight: bold;
    transition: all 0.3s ease;
}
.gr-button:hover {
    background-color: #45a049 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.2);
    transform: translateY(-2px);
}
.gr-form {
    border-radius: 10px;
    background-color: rgba(255, 255, 255, 0.1);
    padding: 20px;
    margin-bottom: 20px;
}
.gr-box {
    border-radius: 8px;
    border: 1px solid rgba(255, 255, 255, 0.2);
    padding: 15px;
    margin-top: 10px;
    background-color: rgba(255, 255, 255, 0.05);
}
.gr-padded {
    padding: 15px;
}
.gr-input, .gr-textarea {
    background-color: rgba(255, 255, 255, 0.1) !important;
    border: 2px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 8px !important;
    color: white !important;
}
.gr-input:focus, .gr-textarea:focus {
    border-color: #4CAF50 !important;
    box-shadow: 0 0 0 2px rgba(76, 175, 80, 0.2) !important;
}
#component-0 {
    border-radius: 10px;
    overflow: hidden;
}
.footer {
    text-align: center;
    margin-top: 20px;
    font-size: 0.9em;
    color: rgba(255, 255, 255, 0.7);
}
"""
|
|
|
|
|
|
|
|
# Header markup (title and short description) rendered at the top of the
# page through gr.HTML(html_template).
html_template = """
<div class="container">
<h1>🇮🇳 Byte Pair Encoding for Hindi</h1>
<p style="text-align: center; margin-bottom: 20px; color: rgba(255, 255, 255, 0.8);">
Compress and tokenize Hindi text using the BPE algorithm.
Enter your text below or use one of the examples provided.
</p>
</div>
"""
|
|
|
|
|
|
|
|
# Build the Gradio UI. Components are created in display order, so the
# statement order below defines the page layout.
with gr.Blocks(css=custom_css) as iface:
    # Page header.
    gr.HTML(html_template)

    # Input area.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=5, label="Input Hindi Text", placeholder="Enter Hindi text here or leave blank for an example")

    # Action button.
    with gr.Row():
        submit_btn = gr.Button("Process", variant="primary")

    # Two side-by-side outputs: the statistics report and the encoded text.
    with gr.Row():
        with gr.Column():
            output_stats = gr.Textbox(label="BPE Statistics")
        with gr.Column():
            output_encoded = gr.Textbox(label="Encoded Text")

    gr.Markdown("The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above.")

    # Sample sentences bound to the input textbox.
    examples = gr.Examples(
        examples=[
            ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
            ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
            ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"]
        ],
        inputs=[input_text],
    )

    # Clicking the button calls bpe_app with the textbox value and writes its
    # two return values to the two output boxes.
    submit_btn.click(bpe_app, inputs=[input_text], outputs=[output_stats, output_encoded])

    gr.HTML('<div class="footer">Developed with ❤️ for Hindi language processing</div>')


# Runs at module level, so the app starts whenever this file is executed
# or imported.
# NOTE(review): share=True presumably requests a publicly reachable Gradio
# link and inbrowser=True opens a browser tab -- confirm both are wanted
# outside local demos.
iface.launch(inbrowser=True, share=True)