File size: 6,870 Bytes
0f0dabd
 
9736ac2
0f0dabd
 
e1974fb
 
a1bcbda
0f0dabd
 
 
 
 
 
 
 
 
 
 
 
 
 
9736ac2
0f0dabd
 
 
 
 
 
389fd8f
 
 
 
9736ac2
e1974fb
9736ac2
79cf235
0f0dabd
9736ac2
 
0f0dabd
79cf235
e1974fb
0f0dabd
 
 
 
 
 
e1974fb
 
389fd8f
79cf235
e1974fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389fd8f
e1974fb
f73fa89
 
 
 
 
f2ea075
f73fa89
f2ea075
 
 
a8ff1d1
 
 
f2ea075
 
 
 
 
 
 
a8ff1d1
f2ea075
a8ff1d1
 
f2ea075
 
a8ff1d1
f2ea075
 
 
a8ff1d1
f2ea075
 
a8ff1d1
f2ea075
a8ff1d1
 
 
 
f2ea075
 
a8ff1d1
 
 
f2ea075
 
 
a8ff1d1
 
 
f2ea075
 
 
a8ff1d1
f2ea075
 
a8ff1d1
f2ea075
 
 
 
 
a8ff1d1
 
f2ea075
a8ff1d1
f2ea075
 
a8ff1d1
 
f2ea075
 
 
 
 
a8ff1d1
 
 
 
 
 
f2ea075
 
 
 
f73fa89
f2ea075
 
a8ff1d1
f2ea075
 
 
 
 
 
 
fb15d4c
 
 
 
 
 
 
 
a8ff1d1
fb15d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9673a18
a8ff1d1
 
fb15d4c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import re
from collections import Counter
import gradio as gr

def preprocess_text(text):
    """Strip non-Devanagari characters and normalise whitespace.

    Any character that is neither whitespace nor inside the Unicode range
    U+0900-U+097F is deleted. The remainder is split on whitespace and
    rejoined with single spaces, which also trims leading/trailing blanks.
    """
    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    pieces = devanagari_only.split()
    return ' '.join(pieces)

def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs in ``vocab``.

    ``vocab`` maps strings to frequencies. Every key is split on whitespace
    into symbols, and each neighbouring ``(left, right)`` pair is credited
    with the key's frequency. Returns a ``Counter`` keyed by those tuples.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        parts = entry.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(parts, parts[1:]):
            pair_counts[(left, right)] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Fuse every occurrence of a symbol pair in a vocabulary.

    Args:
        pair: Sequence of symbols to merge (normally a 2-tuple as produced
            by ``get_stats``).
        v_in: Mapping from space-separated symbol strings to frequencies.

    Returns:
        A new ``dict`` in which each whole-symbol occurrence of the pair has
        been replaced by the concatenated symbol. Entries that become
        identical after merging have their frequencies added together.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # Only match the pair when it is delimited by whitespace (or the string
    # edges); a plain substring replace would also glue together the tail of
    # one symbol and the head of the next, e.g. "xa bc" for the pair (a, b).
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps any backslash in the symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate instead of overwriting so colliding keys keep their
        # combined frequency.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

def apply_bpe(text, bpe_codes):
    """Tokenise ``text`` by replaying BPE merges in the order given.

    Each whitespace-separated word starts out as a list of its individual
    characters. For every entry in ``bpe_codes`` the adjacent symbols that
    equal the entry's pair are fused, scanning each word left to right.
    Merges never cross word boundaries.

    Args:
        text: Text whose words are separated by whitespace.
        bpe_codes: Iterable of ``(pair, count)`` entries where ``pair`` is a
            two-element sequence ``(left, right)``. ``count`` is ignored.

    Returns:
        A flat list of tokens, in the order the words appear in ``text``.

    Raises:
        ValueError: If a ``pair`` does not contain exactly two symbols.
    """
    words = [list(word) for word in text.split()]

    for pair, _ in bpe_codes:
        left, right = pair
        merged = left + right
        for index, symbols in enumerate(words):
            if len(symbols) < 2:
                continue  # nothing to merge in a single-symbol word
            fused = []
            pos = 0
            last = len(symbols) - 1
            while pos <= last:
                if pos < last and symbols[pos] == left and symbols[pos + 1] == right:
                    fused.append(merged)
                    pos += 2
                else:
                    fused.append(symbols[pos])
                    pos += 1
            words[index] = fused

    return [symbol for symbols in words for symbol in symbols]

def perform_bpe(text):
    """Learn BPE merges on ``text`` and report size statistics.

    The text is cleaned with ``preprocess_text``. The working vocabulary is
    seeded with the whitespace-separated words plus every 2- and 3-character
    window of the cleaned text (windows may contain spaces). Merges are then
    learned greedily: the most frequent pair reported by ``get_stats`` is
    merged with ``merge_vocab`` until no pairs remain, or until the
    vocabulary holds at least 5000 entries and the compression ratio is at
    least 3.

    "Original size" is the character count of the cleaned text, "compressed
    size" is the number of items returned by ``apply_bpe``, and the ratio is
    the former divided by the latter (0 when nothing was encoded).

    Args:
        text: Raw input text.

    Returns:
        A ``(report, encoded)`` tuple: a multi-line statistics string and the
        ``apply_bpe`` output joined with single spaces.
    """
    preprocessed_text = preprocess_text(text)
    original_size = len(preprocessed_text)

    vocab = Counter(preprocessed_text.split())
    for width in (2, 3):
        vocab.update(
            preprocessed_text[start:start + width]
            for start in range(len(preprocessed_text) - width + 1)
        )
    bpe_codes = []

    def encode():
        """Encode the cleaned text with the merges learned so far."""
        tokens = apply_bpe(preprocessed_text, bpe_codes)
        size = len(tokens)
        ratio = original_size / size if size > 0 else 0
        return tokens, size, ratio

    while True:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

        # Re-encoding is the expensive step, so only do it when the cheaper
        # vocabulary-size half of the stop test already holds.
        if len(vocab) >= 5000 and encode()[2] >= 3:
            break

    # Always computed after the loop so the statistics exist even when no
    # merge could be learned (empty, non-Devanagari or single-word input).
    encoded_text, compressed_size, compression_ratio = encode()

    vocab_ok = len(vocab) >= 5000
    ratio_ok = compression_ratio >= 3

    result = f"Vocabulary size: {len(vocab)}\n"
    result += f"Original size: {original_size}\n"
    result += f"Compressed size: {compressed_size}\n"
    result += f"Compression ratio: {compression_ratio:.2f}X\n\n"

    if vocab_ok and ratio_ok:
        result += "Both criteria are met!"
    elif vocab_ok:
        result += "Vocabulary size criterion is met, but compression ratio is below 3."
    elif ratio_ok:
        result += "Compression ratio criterion is met, but vocabulary size is below 5000."
    else:
        result += "Neither criterion is met."

    return result, ' '.join(encoded_text)

def bpe_app(input_text):
    """Gradio callback: run BPE on the submitted text.

    When the input is missing, empty, or consists only of whitespace, a
    built-in Hindi example sentence is processed instead.

    Args:
        input_text: Text from the input box (may be ``None`` or empty).

    Returns:
        The ``(statistics, encoded_text)`` tuple produced by ``perform_bpe``.
    """
    # Whitespace-only input counts as blank too, so it gets the example
    # rather than being processed as empty text.
    if not input_text or not input_text.strip():
        input_text = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
    stats, encoded_text = perform_bpe(input_text)
    return stats, encoded_text

# Custom CSS for the Blocks UI. This is handed to gr.Blocks(css=...), which
# expects a plain stylesheet: wrapping it in <style> tags would make the tag
# text part of the first selector and invalidate the `body` rule.
custom_css = """
    body {
        font-family: 'Poppins', sans-serif;
        background: linear-gradient(135deg, #1e3c72, #2a5298);
        color: #ffffff;
    }
    .container {
        max-width: 900px;
        margin: 0 auto;
        padding: 20px;
    }
    .gradio-container {
        background-color: rgba(255, 255, 255, 0.1);
        border-radius: 15px;
        backdrop-filter: blur(10px);
        box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
    }
    h1 {
        color: #ffffff;
        text-align: center;
        font-size: 2.5em;
        margin-bottom: 20px;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
    }
    .gr-button {
        background-color: #4CAF50 !important;
        border: none !important;
        color: white !important;
        text-transform: uppercase;
        font-weight: bold;
        transition: all 0.3s ease;
    }
    .gr-button:hover {
        background-color: #45a049 !important;
        box-shadow: 0 5px 15px rgba(0,0,0,0.2);
        transform: translateY(-2px);
    }
    .gr-form {
        border-radius: 10px;
        background-color: rgba(255, 255, 255, 0.1);
        padding: 20px;
        margin-bottom: 20px;
    }
    .gr-box {
        border-radius: 8px;
        border: 1px solid rgba(255, 255, 255, 0.2);
        padding: 15px;
        margin-top: 10px;
        background-color: rgba(255, 255, 255, 0.05);
    }
    .gr-padded {
        padding: 15px;
    }
    .gr-input, .gr-textarea {
        background-color: rgba(255, 255, 255, 0.1) !important;
        border: 2px solid rgba(255, 255, 255, 0.2) !important;
        border-radius: 8px !important;
        color: white !important;
    }
    .gr-input:focus, .gr-textarea:focus {
        border-color: #4CAF50 !important;
        box-shadow: 0 0 0 2px rgba(76, 175, 80, 0.2) !important;
    }
    #component-0 {
        border-radius: 10px;
        overflow: hidden;
    }
    .footer {
        text-align: center;
        margin-top: 20px;
        font-size: 0.9em;
        color: rgba(255, 255, 255, 0.7);
    }
"""

# HTML header for the page: a title and a short usage description, wrapped
# in a `.container` div. Rendered via gr.HTML() when the UI is built.
html_template = """
<div class="container">
    <h1>🇮🇳 Byte Pair Encoding for Hindi</h1>
    <p style="text-align: center; margin-bottom: 20px; color: rgba(255, 255, 255, 0.8);">
        Compress and tokenize Hindi text using the BPE algorithm. 
        Enter your text below or use one of the examples provided.
    </p>
</div>
"""

# Sample inputs offered below the form; each row fills the single input box.
_example_rows = [
    ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
    ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
    ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"],
]

# Assemble the Blocks UI: header, input box, button, two output boxes,
# explanatory note, examples and footer, in that order.
with gr.Blocks(css=custom_css) as iface:
    gr.HTML(html_template)

    # Input area
    with gr.Row(), gr.Column():
        input_text = gr.Textbox(
            lines=5,
            label="Input Hindi Text",
            placeholder="Enter Hindi text here or leave blank for an example",
        )

    # Action button
    with gr.Row():
        submit_btn = gr.Button("Process", variant="primary")

    # Results: statistics on the left, encoded text on the right
    with gr.Row():
        with gr.Column():
            output_stats = gr.Textbox(label="BPE Statistics")
        with gr.Column():
            output_encoded = gr.Textbox(label="Encoded Text")

    gr.Markdown(
        "The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above."
    )

    examples = gr.Examples(examples=_example_rows, inputs=[input_text])

    # Wire the button to the BPE callback.
    submit_btn.click(
        fn=bpe_app,
        inputs=[input_text],
        outputs=[output_stats, output_encoded],
    )

    gr.HTML('<div class="footer">Developed with ❤️ for Hindi language processing</div>')

# Start the app (runs on import/execution of this module).
iface.launch(share=True, inbrowser=True)