Spaces:

sagar007
/

BPE

Sleeping

BPE

File size: 6,870 Bytes

0f0dabd
 
9736ac2
0f0dabd
 
e1974fb
 
a1bcbda
0f0dabd
 
 
 
 
 
 
 
 
 
 
 
 
 
9736ac2
0f0dabd
 
 
 
 
 
389fd8f
 
 
 
9736ac2
e1974fb
9736ac2
79cf235
0f0dabd
9736ac2
 
0f0dabd
79cf235
e1974fb
0f0dabd
 
 
 
 
 
e1974fb
 
389fd8f
79cf235
e1974fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389fd8f
e1974fb
f73fa89
 
 
 
 
f2ea075
f73fa89
f2ea075
 
 
a8ff1d1
 
 
f2ea075
 
 
 
 
 
 
a8ff1d1
f2ea075
a8ff1d1
 
f2ea075
 
a8ff1d1
f2ea075
 
 
a8ff1d1
f2ea075
 
a8ff1d1
f2ea075
a8ff1d1
 
 
 
f2ea075
 
a8ff1d1
 
 
f2ea075
 
 
a8ff1d1
 
 
f2ea075
 
 
a8ff1d1
f2ea075
 
a8ff1d1
f2ea075
 
 
 
 
a8ff1d1
 
f2ea075
a8ff1d1
f2ea075
 
a8ff1d1
 
f2ea075
 
 
 
 
a8ff1d1
 
 
 
 
 
f2ea075
 
 
 
f73fa89
f2ea075
 
a8ff1d1
f2ea075
 
 
 
 
 
 
fb15d4c
 
 
 
 
 
 
 
a8ff1d1
fb15d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9673a18
a8ff1d1
 
fb15d4c

import re
from collections import Counter
import gradio as gr

def preprocess_text(text):
    """Keep only Devanagari characters and normalise whitespace.

    Every character outside the Unicode range U+0900-U+097F that is not
    whitespace is dropped; the remaining whitespace runs are collapsed to
    single spaces and leading/trailing whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # split() with no arguments discards empty fields, which both trims the
    # ends and squeezes interior whitespace runs.
    return ' '.join(devanagari_only.split())

def get_stats(vocab):
    """Count adjacent symbol pairs across a vocabulary.

    Args:
        vocab: Mapping from an entry (symbols separated by whitespace) to
            its frequency.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` symbol tuples; each value is
        the summed frequency of the entries in which that pair is adjacent,
        counted once per occurrence.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[(left, right)] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Merge every occurrence of a symbol pair in the vocabulary keys.

    Args:
        pair: Two-element sequence ``(left, right)`` of symbols to fuse.
        v_in: Mapping from an entry (symbols separated by single spaces) to
            its frequency.

    Returns:
        A new dict in which each ``"left right"`` occurrence that lines up
        with whole symbols is replaced by ``"leftright"``. Entries that
        become identical after merging have their frequencies added together.
    """
    replacement = ''.join(pair)
    # The lookarounds anchor the match on symbol boundaries. A plain
    # str.replace would also fire inside longer symbols, e.g. turning
    # "aa b" into "aab" for the pair ("a", "b").
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement keeps backslashes in symbols literal.
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate instead of overwrite so colliding entries keep their
        # combined frequency.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

def apply_bpe(text, bpe_codes):
    """Tokenise text by applying learned pair merges inside each word.

    Each whitespace-separated word is split into single characters. While a
    word still contains an adjacent symbol pair listed in ``bpe_codes``, the
    pair that appears earliest in ``bpe_codes`` is fused (all of its
    non-overlapping occurrences, left to right). Merges never cross word
    boundaries.

    Args:
        text: Text to encode; words are separated by whitespace.
        bpe_codes: Sequence of ``(pair, frequency)`` entries in the order the
            merges were learned, where ``pair`` is a ``(left, right)`` tuple
            of symbols. The frequency is ignored here.

    Returns:
        A flat list of tokens for the whole text, so ``len()`` of the result
        is the token count. With no codes this is one token per character.
    """
    # Earlier merges take priority; a repeated pair keeps its first rank.
    ranks = {}
    for rank, (pair, _) in enumerate(bpe_codes):
        ranks.setdefault(tuple(pair), rank)

    tokens = []
    for word in text.split():
        symbols = list(word)
        while len(symbols) > 1:
            candidates = [p for p in zip(symbols, symbols[1:]) if p in ranks]
            if not candidates:
                break
            left, right = min(candidates, key=ranks.__getitem__)

            # Rebuild the word with every occurrence of the chosen pair fused.
            merged = []
            i = 0
            while i < len(symbols):
                if (i + 1 < len(symbols)
                        and symbols[i] == left
                        and symbols[i + 1] == right):
                    merged.append(left + right)
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            symbols = merged
        tokens.extend(symbols)
    return tokens

def perform_bpe(text):
    """Learn pair merges on Hindi text and report size statistics.

    The text is cleaned with ``preprocess_text``. Merges are then learned one
    at a time, always taking the most frequent adjacent symbol pair reported
    by ``get_stats``, until no pair is left or both targets are reached: at
    least 5000 vocabulary entries and a compression ratio of at least 3.

    Args:
        text: Raw input text.

    Returns:
        A ``(report, encoded)`` tuple: a multi-line statistics report and the
        output of ``apply_bpe`` joined with single spaces.
    """
    preprocessed_text = preprocess_text(text)
    original_size = len(preprocessed_text)

    # Vocabulary entries: whole words plus every 2- and 3-character window of
    # the cleaned text. get_stats splits entries on whitespace, so only
    # entries containing an inner space contribute symbol pairs.
    # NOTE(review): textbook BPE starts from words split into characters;
    # confirm this construction is intended.
    vocab = Counter(preprocessed_text.split())
    vocab.update(preprocessed_text[i:i + 2] for i in range(len(preprocessed_text) - 1))
    vocab.update(preprocessed_text[i:i + 3] for i in range(len(preprocessed_text) - 2))
    bpe_codes = []

    # Measure once before the loop so the report below is defined even when
    # no merge is possible (empty or single-word input). Previously that path
    # raised UnboundLocalError.
    encoded_text = apply_bpe(preprocessed_text, bpe_codes)
    compressed_size = len(encoded_text)
    compression_ratio = original_size / compressed_size if compressed_size > 0 else 0

    while True:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))

        encoded_text = apply_bpe(preprocessed_text, bpe_codes)
        compressed_size = len(encoded_text)
        compression_ratio = original_size / compressed_size if compressed_size > 0 else 0

        if len(vocab) >= 5000 and compression_ratio >= 3:
            break

    vocab_ok = len(vocab) >= 5000
    ratio_ok = compression_ratio >= 3

    result = f"Vocabulary size: {len(vocab)}\n"
    result += f"Original size: {original_size}\n"
    result += f"Compressed size: {compressed_size}\n"
    result += f"Compression ratio: {compression_ratio:.2f}X\n\n"

    if vocab_ok and ratio_ok:
        result += "Both criteria are met!"
    elif vocab_ok:
        result += "Vocabulary size criterion is met, but compression ratio is below 3."
    elif ratio_ok:
        result += "Compression ratio criterion is met, but vocabulary size is below 5000."
    else:
        result += "Neither criterion is met."

    return result, ' '.join(encoded_text)

def bpe_app(input_text):
    """Callback for the "Process" button: run BPE on the textbox content.

    Args:
        input_text: Text from the input box. ``None``, an empty string, or a
            whitespace-only string is replaced by a built-in sample sentence.

    Returns:
        The ``(statistics, encoded_text)`` pair produced by ``perform_bpe``.
    """
    # Whitespace-only input counts as blank too, so it gets the sample
    # sentence rather than being processed as empty text.
    if not input_text or not input_text.strip():
        input_text = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
    return perform_bpe(input_text)

# Custom CSS
custom_css = """
<style>
    body {
        font-family: 'Poppins', sans-serif;
        background: linear-gradient(135deg, #1e3c72, #2a5298);
        color: #ffffff;
    }
    .container {
        max-width: 900px;
        margin: 0 auto;
        padding: 20px;
    }
    .gradio-container {
        background-color: rgba(255, 255, 255, 0.1);
        border-radius: 15px;
        backdrop-filter: blur(10px);
        box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
    }
    h1 {
        color: #ffffff;
        text-align: center;
        font-size: 2.5em;
        margin-bottom: 20px;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
    }
    .gr-button {
        background-color: #4CAF50 !important;
        border: none !important;
        color: white !important;
        text-transform: uppercase;
        font-weight: bold;
        transition: all 0.3s ease;
    }
    .gr-button:hover {
        background-color: #45a049 !important;
        box-shadow: 0 5px 15px rgba(0,0,0,0.2);
        transform: translateY(-2px);
    }
    .gr-form {
        border-radius: 10px;
        background-color: rgba(255, 255, 255, 0.1);
        padding: 20px;
        margin-bottom: 20px;
    }
    .gr-box {
        border-radius: 8px;
        border: 1px solid rgba(255, 255, 255, 0.2);
        padding: 15px;
        margin-top: 10px;
        background-color: rgba(255, 255, 255, 0.05);
    }
    .gr-padded {
        padding: 15px;
    }
    .gr-input, .gr-textarea {
        background-color: rgba(255, 255, 255, 0.1) !important;
        border: 2px solid rgba(255, 255, 255, 0.2) !important;
        border-radius: 8px !important;
        color: white !important;
    }
    .gr-input:focus, .gr-textarea:focus {
        border-color: #4CAF50 !important;
        box-shadow: 0 0 0 2px rgba(76, 175, 80, 0.2) !important;
    }
    #component-0 {
        border-radius: 10px;
        overflow: hidden;
    }
    .footer {
        text-align: center;
        margin-top: 20px;
        font-size: 0.9em;
        color: rgba(255, 255, 255, 0.7);
    }
</style>
"""

# HTML Template
# Page header markup (title plus a short description) rendered through
# gr.HTML at the top of the Blocks layout. The "container" class is styled by
# the .container rule in custom_css.
html_template = """
<div class="container">
    <h1>🇮🇳 Byte Pair Encoding for Hindi</h1>
    <p style="text-align: center; margin-bottom: 20px; color: rgba(255, 255, 255, 0.8);">
        Compress and tokenize Hindi text using the BPE algorithm. 
        Enter your text below or use one of the examples provided.
    </p>
</div>
"""

# Create Gradio interface with custom theme
# Components are created inside nested context managers, so the order of the
# statements below defines the page layout from top to bottom.
with gr.Blocks(css=custom_css) as iface:
    # Static header (title and description).
    gr.HTML(html_template)
    
    # Input area: one multi-line textbox for the Hindi text.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=5, label="Input Hindi Text", placeholder="Enter Hindi text here or leave blank for an example")
    
    with gr.Row():
        submit_btn = gr.Button("Process", variant="primary")
    
    # Output area: statistics report and encoded text side by side.
    with gr.Row():
        with gr.Column():
            output_stats = gr.Textbox(label="BPE Statistics")
        with gr.Column():
            output_encoded = gr.Textbox(label="Encoded Text")
    
    gr.Markdown("The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above.")
    
    # Sample sentences bound to the input textbox.
    examples = gr.Examples(
        examples=[
            ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
            ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
            ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"]
        ],
        inputs=[input_text],
    )
    
    # bpe_app returns (statistics, encoded_text), matching the two outputs.
    submit_btn.click(bpe_app, inputs=[input_text], outputs=[output_stats, output_encoded])

    gr.HTML('<div class="footer">Developed with ❤️ for Hindi language processing</div>')

# Launch the app
# NOTE(review): this runs whenever the module is executed or imported; there
# is no `if __name__ == "__main__":` guard.
iface.launch(inbrowser=True, share=True)