File size: 6,870 Bytes
0f0dabd 9736ac2 0f0dabd e1974fb a1bcbda 0f0dabd 9736ac2 0f0dabd 389fd8f 9736ac2 e1974fb 9736ac2 79cf235 0f0dabd 9736ac2 0f0dabd 79cf235 e1974fb 0f0dabd e1974fb 389fd8f 79cf235 e1974fb 389fd8f e1974fb f73fa89 f2ea075 f73fa89 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 a8ff1d1 f2ea075 f73fa89 f2ea075 a8ff1d1 f2ea075 fb15d4c a8ff1d1 fb15d4c 9673a18 a8ff1d1 fb15d4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
import re
from collections import Counter
import gradio as gr
def preprocess_text(text):
    """Strip everything except Devanagari characters and tidy the spacing.

    Characters that are neither whitespace nor inside the Unicode block
    U+0900-U+097F are deleted outright (they are not replaced by a space, so
    words separated only by such a character are joined). The remaining text
    is then split on whitespace and re-joined with single spaces, which also
    trims leading and trailing whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty if nothing survives the filter.
    """
    return ' '.join(re.sub(r'[^\u0900-\u097F\s]', '', text).split())
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in the vocabulary.

    Each key of ``vocab`` is split on whitespace into symbols; every pair of
    neighbouring symbols is credited with that key's frequency.

    Args:
        vocab: Mapping from a whitespace-separated symbol string to its
            frequency.

    Returns:
        A ``Counter`` keyed by ``(left_symbol, right_symbol)`` tuples. Keys
        with fewer than two symbols contribute nothing.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        parts = entry.split()
        # zip against the list shifted by one yields each neighbouring pair.
        for left, right in zip(parts, parts[1:]):
            pair_counts[left, right] += count
    return pair_counts
def merge_vocab(pair, v_in):
    """Merge every occurrence of a symbol pair in the vocabulary keys.

    Args:
        pair: Two-element sequence ``(left, right)`` of symbols to fuse.
        v_in: Mapping from a whitespace-separated symbol string to its
            frequency.

    Returns:
        A new ``dict`` in which each standalone ``"left right"`` occurrence
        has been replaced by ``"leftright"``. When several input keys collapse
        to the same output key their frequencies are summed.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # Anchor on symbol boundaries: a plain str.replace would also rewrite
    # "xa b" for the pair ("a", "b"), fusing parts of unrelated symbols.
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps backslashes in symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate instead of overwrite so colliding keys keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
def _merge_symbol_pair(symbols, pair):
    """Return ``symbols`` with each adjacent ``pair`` fused into one symbol."""
    first, second = pair
    merged = []
    i = 0
    last = len(symbols) - 1
    while i <= last:
        if i < last and symbols[i] == first and symbols[i + 1] == second:
            merged.append(first + second)
            i += 2  # skip both halves of the pair just merged
        else:
            merged.append(symbols[i])
            i += 1
    return merged


def apply_bpe(text, bpe_codes):
    """Tokenize ``text`` by replaying learned BPE merges on every word.

    Each whitespace-separated word starts as a list of its individual
    characters; the merges in ``bpe_codes`` are then applied in order, fusing
    adjacent symbols that match a learned pair.

    Args:
        text: Whitespace-separated input text.
        bpe_codes: Iterable of ``(pair, frequency)`` items where ``pair`` is a
            ``(left, right)`` tuple of symbols. The frequency is ignored.

    Returns:
        A flat list of tokens for all words, in input order. With no merges
        this is simply the list of characters of every word.
    """
    # The previous `' ' in pair` test was a tuple-membership check that never
    # held for symbols produced by split(), so no merge was ever applied.
    merges = [pair for pair, _ in bpe_codes]
    encoded_by_word = {}  # repeated words are encoded only once
    tokens = []
    for word in text.split():
        symbols = encoded_by_word.get(word)
        if symbols is None:
            symbols = list(word)
            for pair in merges:
                if len(symbols) < 2:
                    break  # nothing left to merge in this word
                symbols = _merge_symbol_pair(symbols, pair)
            encoded_by_word[word] = symbols
        tokens.extend(symbols)
    return tokens
def _measure_compression(preprocessed_text, bpe_codes):
    """Encode the text and return ``(tokens, token_count, ratio)``."""
    encoded = apply_bpe(preprocessed_text, bpe_codes)
    size = len(encoded)
    # Ratio is characters of cleaned text per emitted token; 0 when empty.
    ratio = len(preprocessed_text) / size if size > 0 else 0
    return encoded, size, ratio


def perform_bpe(text, *, min_vocab_size=5000, min_compression_ratio=3):
    """Run the BPE merge loop on ``text`` and build a statistics report.

    The text is cleaned with ``preprocess_text``; merges are then learned one
    at a time (most frequent pair first) until no pairs remain or both
    targets are reached.

    Args:
        text: Raw input text.
        min_vocab_size: Vocabulary-size target used for early stopping and in
            the report.
        min_compression_ratio: Compression-ratio target used for early
            stopping and in the report.

    Returns:
        A ``(report, encoded)`` tuple: a multi-line statistics string and the
        encoded tokens joined by single spaces.
    """
    preprocessed_text = preprocess_text(text)
    original_size = len(preprocessed_text)

    # Seed vocabulary: whole words plus every 2- and 3-character window of the
    # cleaned text (windows may contain spaces).
    # NOTE(review): get_stats splits keys on whitespace, so only keys with an
    # internal space yield pairs - confirm this seeding is what is intended.
    vocab = Counter(preprocessed_text.split())
    vocab.update(preprocessed_text[i:i + 2] for i in range(original_size - 1))
    vocab.update(preprocessed_text[i:i + 3] for i in range(original_size - 2))

    bpe_codes = []
    # Baseline measurement so the report is defined even when the loop below
    # finds no pair at all (single-word or empty input); previously that
    # case raised UnboundLocalError.
    encoded_text, compressed_size, compression_ratio = _measure_compression(
        preprocessed_text, bpe_codes
    )

    while True:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

        encoded_text, compressed_size, compression_ratio = _measure_compression(
            preprocessed_text, bpe_codes
        )
        if len(vocab) >= min_vocab_size and compression_ratio >= min_compression_ratio:
            break

    vocab_ok = len(vocab) >= min_vocab_size
    ratio_ok = compression_ratio >= min_compression_ratio
    if vocab_ok and ratio_ok:
        verdict = "Both criteria are met!"
    elif vocab_ok:
        verdict = (
            "Vocabulary size criterion is met, but compression ratio is "
            f"below {min_compression_ratio}."
        )
    elif ratio_ok:
        verdict = (
            "Compression ratio criterion is met, but vocabulary size is "
            f"below {min_vocab_size}."
        )
    else:
        verdict = "Neither criterion is met."

    result = (
        f"Vocabulary size: {len(vocab)}\n"
        f"Original size: {original_size}\n"
        f"Compressed size: {compressed_size}\n"
        f"Compression ratio: {compression_ratio:.2f}X\n\n"
        f"{verdict}"
    )
    return result, ' '.join(encoded_text)
def bpe_app(input_text):
    """Callback for the UI: run BPE on the given text or on a built-in sample.

    Args:
        input_text: Text from the input box. Any falsy value (empty string,
            ``None``) is replaced by a fixed Hindi example sentence.

    Returns:
        The ``(statistics, encoded_text)`` pair produced by ``perform_bpe``.
    """
    sample = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
    chosen_text = input_text if input_text else sample
    report, tokens = perform_bpe(chosen_text)
    return report, tokens
# Custom CSS for the Gradio page.
# This string is handed to gr.Blocks(css=...), which expects plain CSS rules.
# It must NOT be wrapped in <style>...</style>: the tags are not valid CSS and
# the leading one turns the first selector into "<style> body", dropping that
# rule.
custom_css = """
body {
    font-family: 'Poppins', sans-serif;
    background: linear-gradient(135deg, #1e3c72, #2a5298);
    color: #ffffff;
}
.container {
    max-width: 900px;
    margin: 0 auto;
    padding: 20px;
}
.gradio-container {
    background-color: rgba(255, 255, 255, 0.1);
    border-radius: 15px;
    backdrop-filter: blur(10px);
    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
}
h1 {
    color: #ffffff;
    text-align: center;
    font-size: 2.5em;
    margin-bottom: 20px;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
.gr-button {
    background-color: #4CAF50 !important;
    border: none !important;
    color: white !important;
    text-transform: uppercase;
    font-weight: bold;
    transition: all 0.3s ease;
}
.gr-button:hover {
    background-color: #45a049 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.2);
    transform: translateY(-2px);
}
.gr-form {
    border-radius: 10px;
    background-color: rgba(255, 255, 255, 0.1);
    padding: 20px;
    margin-bottom: 20px;
}
.gr-box {
    border-radius: 8px;
    border: 1px solid rgba(255, 255, 255, 0.2);
    padding: 15px;
    margin-top: 10px;
    background-color: rgba(255, 255, 255, 0.05);
}
.gr-padded {
    padding: 15px;
}
.gr-input, .gr-textarea {
    background-color: rgba(255, 255, 255, 0.1) !important;
    border: 2px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 8px !important;
    color: white !important;
}
.gr-input:focus, .gr-textarea:focus {
    border-color: #4CAF50 !important;
    box-shadow: 0 0 0 2px rgba(76, 175, 80, 0.2) !important;
}
#component-0 {
    border-radius: 10px;
    overflow: hidden;
}
.footer {
    text-align: center;
    margin-top: 20px;
    font-size: 0.9em;
    color: rgba(255, 255, 255, 0.7);
}
"""
# HTML banner rendered at the top of the page: title plus a short usage hint.
# Built line by line; the leading and trailing empty items reproduce the
# newline at the start and end of the markup.
html_template = "\n".join([
    "",
    '<div class="container">',
    "<h1>🇮🇳 Byte Pair Encoding for Hindi</h1>",
    '<p style="text-align: center; margin-bottom: 20px; color: rgba(255, 255, 255, 0.8);">',
    "Compress and tokenize Hindi text using the BPE algorithm.",
    "Enter your text below or use one of the examples provided.",
    "</p>",
    "</div>",
    "",
])
# Build the Gradio UI. Components are laid out top to bottom in the order they
# are created inside the Blocks context: banner, input box, button, the two
# output boxes side by side, an explanatory note, examples, and a footer.
with gr.Blocks(css=custom_css) as iface:
    gr.HTML(html_template)

    # Input area.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=5, label="Input Hindi Text", placeholder="Enter Hindi text here or leave blank for an example")

    with gr.Row():
        submit_btn = gr.Button("Process", variant="primary")

    # Results: statistics on the left, encoded text on the right.
    with gr.Row():
        with gr.Column():
            output_stats = gr.Textbox(label="BPE Statistics")
        with gr.Column():
            output_encoded = gr.Textbox(label="Encoded Text")

    gr.Markdown("The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above.")

    # Clickable sample sentences that populate the input box.
    examples = gr.Examples(
        examples=[
            ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
            ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
            ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"]
        ],
        inputs=[input_text],
    )

    # Event wiring must happen inside the Blocks context.
    submit_btn.click(bpe_app, inputs=[input_text], outputs=[output_stats, output_encoded])

    gr.HTML('<div class="footer">Developed with ❤️ for Hindi language processing</div>')

# Launch the app.
# NOTE(review): share=True requests a public share link and inbrowser=True
# opens a local browser tab - confirm both are wanted where this is deployed.
iface.launch(inbrowser=True, share=True)