Spaces:

sagar007
/

BPE

Sleeping

App Files Files Community

sagar007 committed on Jun 21, 2024

Commit

9736ac2

verified ·

1 Parent(s): 20f38a4

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -58

app.py CHANGED Viewed

@@ -1,15 +1,31 @@
-import gradio as gr
 import re
 from collections import Counter
 def preprocess_text(text):
-    # Remove punctuation and special characters, keep Hindi characters and spaces
-    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
-    # Remove extra whitespace
-    text = ' '.join(text.split())
     return text
 def get_stats(vocab):
     pairs = Counter()
     for word, freq in vocab.items():
         symbols = word.split()
@@ -18,75 +34,80 @@ def get_stats(vocab):
     return pairs
 def merge_vocab(pair, v_in):
     v_out = {}
     bigram = ' '.join(pair)
     replacement = ''.join(pair)
     for word in v_in:
-        # Use regex to ensure whole-word replacement
-        w_out = re.sub(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)', replacement, word)
         v_out[w_out] = v_in[word]
     return v_out
 def apply_bpe(text, bpe_codes):
     word_list = text.split()
     for pair, _ in bpe_codes:
-        p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
-        word_list = [p.sub(''.join(pair), word) for word in word_list]
-    return ' '.join(word_list)
-def bpe_process(input_text, target_vocab_size):
-    preprocessed_text = preprocess_text(input_text)
-    # Initialize vocabulary
     vocab = Counter(preprocessed_text.split())
-    # Perform BPE merges
     bpe_codes = []
-    while len(vocab) < target_vocab_size and len(vocab) > 1:
         pairs = get_stats(vocab)
         if not pairs:
             break
         best = max(pairs, key=pairs.get)
         vocab = merge_vocab(best, vocab)
         bpe_codes.append((best, pairs[best]))
-    # Apply BPE to the original text
     encoded_text = apply_bpe(preprocessed_text, bpe_codes)
-    # Calculate compression ratio
-    original_size = len(preprocessed_text.split())
-    compressed_size = len(encoded_text.split())
-    compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
-    # Check if criteria are met
-    criteria_met = {
-        "vocab_size_met": len(vocab) >= 5000,
-        "compression_ratio_met": compression_ratio >= 3
-    }
-    return (
-        encoded_text,
-        len(vocab),
-        compression_ratio,
-        criteria_met
-    )
-# Define the Gradio interface
-iface = gr.Interface(
-    fn=bpe_process,
-    inputs=[
-        gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
-        gr.Slider(minimum=1000, maximum=10000, step=100, value=6000, label="Target Vocabulary Size")
-    ],
-    outputs=[
-        gr.Textbox(label="Encoded Text"),
-        gr.Number(label="Vocabulary Size"),
-        gr.Number(label="Compression Ratio"),
-        gr.JSON(label="Criteria Met")
-    ],
-    title="Byte Pair Encoding (BPE) for Hindi",
-    description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio."
-)
-# Launch the Gradio app
-iface.launch(share=True)

 import re
 from collections import Counter
+import gradio as gr
 def preprocess_text(text):
+    """Preprocesses Hindi text for BPE.
+    Args:
+        text (str): The Hindi text to preprocess.
+    Returns:
+        str: The preprocessed text.
+    """
+    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Remove punctuation and special characters
+    text = ' '.join(text.split())  # Remove extra whitespace
     return text
 def get_stats(vocab):
+    """Gets bigram statistics for BPE merging.
+    Args:
+        vocab (Counter): The vocabulary of word frequencies.
+    Returns:
+        Counter: A counter of bigram frequencies.
+    """
     pairs = Counter()
     for word, freq in vocab.items():
         symbols = word.split()
     return pairs
 def merge_vocab(pair, v_in):
+    """Merges bigrams into single tokens in the vocabulary.
+    Args:
+        pair (tuple): The bigram to merge (word1, word2).
+        v_in (Counter): The input vocabulary.
+    Returns:
+        Counter: The updated vocabulary with the merged bigram.
+    """
     v_out = {}
     bigram = ' '.join(pair)
     replacement = ''.join(pair)
     for word in v_in:
+        w_out = word.replace(bigram, replacement)
         v_out[w_out] = v_in[word]
     return v_out
 def apply_bpe(text, bpe_codes):
+    """Applies BPE to a preprocessed text.
+    Args:
+        text (str): The preprocessed text.
+        bpe_codes (list): A list of bigram pairs for merging.
+    Returns:
+        list: The encoded text as a list of tokens.
+    """
     word_list = text.split()
     for pair, _ in bpe_codes:
+        if ' ' in pair:
+            p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
+            word_list = [p.sub(''.join(pair), word) for word in word_list]
+    return word_list
def bpe_process(text, target_vocab_size=6000):
    """Performs BPE on Hindi text.

    The text is preprocessed, a merge table is learned from it, and the
    same text is then encoded with that table.

    Args:
        text (str): The Hindi text to encode.
        target_vocab_size (int, optional): Stop learning merges once this
            many distinct tokens (single characters plus merged tokens)
            exist. Learning also stops when no pair is left to merge.
            Defaults to 6000.

    Returns:
        tuple: ``(encoded_text, vocab_size, compression_ratio)`` where
        ``encoded_text`` is whatever ``apply_bpe`` returns, ``vocab_size``
        is the number of distinct tokens known after learning, and
        ``compression_ratio`` is the character length of the preprocessed
        text divided by ``len(encoded_text)`` (0.0 when nothing is encoded).
    """
    preprocessed_text = preprocess_text(text)
    words = preprocessed_text.split()

    # Represent each word as space-separated characters so that pair
    # statistics are collected inside words, not across word boundaries.
    # NOTE(review): assumes get_stats counts adjacent whitespace-separated
    # symbols of each key, weighted by its frequency -- confirm.
    vocab = Counter(' '.join(word) for word in words)

    # Token vocabulary: the base alphabet, grown by one entry per new merge.
    tokens = {char for word in words for char in word}

    bpe_codes = []
    while len(tokens) < target_vocab_size:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))
        tokens.add(''.join(best))

    encoded_text = apply_bpe(preprocessed_text, bpe_codes)

    original_size = len(preprocessed_text)
    compressed_size = len(encoded_text)
    # Input with no Devanagari characters encodes to nothing; avoid dividing
    # by zero in that case.
    compression_ratio = original_size / compressed_size if compressed_size else 0.0
    return encoded_text, len(tokens), compression_ratio
+def gradio_demo():
+    """Creates a Gradio app for BPE in Hindi."""
+    iface = gr.Interface(
+        fn=bpe_process,
+        inputs="textbox",
+        outputs=["text", "label", "label"],
+        title="Hindi Byte Pair Encoding (BPE)",
+        description="Enter Hindi text and see