Update app.py
app.py CHANGED
@@ -3,29 +3,11 @@ from collections import Counter
 import gradio as gr
 
 def preprocess_text(text):
-    """Preprocesses Hindi text.
-
-    Args:
-        text (str): The Hindi text to preprocess.
-
-    Returns:
-        str: The preprocessed text.
-    """
-
-    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Remove punctuation and special characters
-    text = ' '.join(text.split())  # Remove extra whitespace
+    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
+    text = ' '.join(text.split())
     return text
 
 def get_stats(vocab):
-    """Gets bigram statistics for BPE merging.
-
-    Args:
-        vocab (Counter): The vocabulary of word frequencies.
-
-    Returns:
-        Counter: A counter of bigram frequencies.
-    """
-
     pairs = Counter()
     for word, freq in vocab.items():
         symbols = word.split()
@@ -34,16 +16,6 @@ def get_stats(vocab):
     return pairs
 
 def merge_vocab(pair, v_in):
-    """Merges bigrams into single tokens in the vocabulary.
-
-    Args:
-        pair (tuple): The bigram to merge (word1, word2).
-        v_in (Counter): The input vocabulary.
-
-    Returns:
-        Counter: The updated vocabulary with the merged bigram.
-    """
-
     v_out = {}
     bigram = ' '.join(pair)
     replacement = ''.join(pair)
@@ -53,16 +25,6 @@ def merge_vocab(pair, v_in):
     return v_out
 
 def apply_bpe(text, bpe_codes):
-    """Applies BPE to a preprocessed text.
-
-    Args:
-        text (str): The preprocessed text.
-        bpe_codes (list): A list of bigram pairs for merging.
-
-    Returns:
-        list: The encoded text as a list of tokens.
-    """
-
     word_list = text.split()
     for pair, _ in bpe_codes:
         if ' ' in pair:
@@ -70,49 +32,61 @@ def apply_bpe(text, bpe_codes):
             word_list = [p.sub(''.join(pair), word) for word in word_list]
     return word_list
 
-def perform_bpe(text, target_vocab_size=6000):
-    """Performs BPE on Hindi text.
-
-    Args:
-        text (str): The Hindi text to encode.
-        target_vocab_size (int, optional): The target vocabulary size. Defaults to 6000.
-
-    Returns:
-        tuple: A tuple containing the encoded text, vocabulary size, and compression ratio.
-    """
-
+def perform_bpe(text):
     preprocessed_text = preprocess_text(text)
+
     vocab = Counter(preprocessed_text.split())
     vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
     vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
-
+
     bpe_codes = []
-    while
+    while True:
         pairs = get_stats(vocab)
         if not pairs:
             break
         best = max(pairs, key=pairs.get)
         vocab = merge_vocab(best, vocab)
         bpe_codes.append((best, pairs[best]))
[18 more removed lines are not shown in this view]
+
+        encoded_text = apply_bpe(preprocessed_text, bpe_codes)
+        original_size = len(preprocessed_text)
+        compressed_size = len(encoded_text)
+        compression_ratio = original_size / compressed_size
+
+        if len(vocab) >= 5000 and compression_ratio >= 3:
+            break
+
+    result = f"Vocabulary size: {len(vocab)}\n"
+    result += f"Original size: {original_size}\n"
+    result += f"Compressed size: {compressed_size}\n"
+    result += f"Compression ratio: {compression_ratio:.2f}X\n\n"
+
+    if len(vocab) >= 5000 and compression_ratio >= 3:
+        result += "Both criteria are met!"
+    elif len(vocab) >= 5000:
+        result += "Vocabulary size criterion is met, but compression ratio is below 3."
+    elif compression_ratio >= 3:
+        result += "Compression ratio criterion is met, but vocabulary size is below 5000."
+    else:
+        result += "Neither criterion is met."
+
+    return result, ' '.join(encoded_text)
+
+def bpe_app(input_text):
+    stats, encoded_text = perform_bpe(input_text)
+    return stats, encoded_text
+
+iface = gr.Interface(
+    fn=bpe_app,
+    inputs=[
+        gr.Textbox(lines=5, label="Input Hindi Text")
+    ],
+    outputs=[
+        gr.Textbox(label="BPE Statistics"),
+        gr.Textbox(label="Encoded Text")
+    ],
+    title="Byte Pair Encoding (BPE) for Hindi Text",
+    description="Enter Hindi text to perform BPE encoding. The algorithm will continue until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above."
 )
 
-
-iface.launch(share=True)
+iface.launch()
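
Note: the character class in preprocess_text keeps only Devanagari code points (U+0900-U+097F) and whitespace, so Latin letters, digits, and most punctuation are dropped before tokenization. A quick standalone check of that step; the sample string and variable names below are illustrative, not taken from the app:

import re

sample = "नमस्ते! Hello, दुनिया 123"                        # hypothetical mixed-script input
cleaned = re.sub(r'[^\u0900-\u097F\s]', '', sample)   # drop non-Devanagari, non-whitespace chars
cleaned = ' '.join(cleaned.split())                   # collapse runs of whitespace
print(cleaned)                                        # -> नमस्ते दुनिया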
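
Note: get_stats and merge_vocab follow the usual BPE pattern of counting adjacent symbol pairs over a space-separated vocabulary and merging the most frequent pair. The sketch below shows that standard pattern; get_stats_sketch, merge_vocab_sketch, and toy_vocab are made-up names and data for illustration, not the exact lines elided from the diff above:

import re
from collections import Counter

def get_stats_sketch(vocab):
    # Count adjacent symbol pairs, weighted by entry frequency.
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab_sketch(pair, v_in):
    # Replace each standalone occurrence of the pair with its concatenation.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(''.join(pair), word): freq for word, freq in v_in.items()}

toy_vocab = {'न म स': 5, 'न म क': 2, 'क म ल': 3}    # hypothetical symbol sequences with counts
pairs = get_stats_sketch(toy_vocab)
best = max(pairs, key=pairs.get)                    # ('न', 'म'), seen 7 times
print(merge_vocab_sketch(best, toy_vocab))          # {'नम स': 5, 'नम क': 2, 'क म ल': 3}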
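
Note: the stopping rule added in perform_bpe combines two thresholds, at least 5000 vocabulary entries and a character-to-token compression ratio of at least 3. With illustrative numbers (not measured from the app), the check reads:

vocab_size = 5200        # hypothetical number of vocabulary entries after merging
original_size = 300      # characters in the preprocessed text
compressed_size = 100    # tokens returned by apply_bpe
compression_ratio = original_size / compressed_size   # 3.0

if vocab_size >= 5000 and compression_ratio >= 3:
    print("Both criteria are met!")   # same message perform_bpe appends to its report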