Update app.py
Browse files
app.py
CHANGED
|
@@ -3,8 +3,11 @@ import re
|
|
| 3 |
from collections import Counter
|
| 4 |
|
| 5 |
def preprocess_text(text):
|
|
|
|
| 6 |
text = re.sub(r'[^\u0900-\u097F\s]', '', text)
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def get_stats(vocab):
|
| 10 |
pairs = Counter()
|
|
@@ -29,12 +32,12 @@ def apply_bpe(text, bpe_codes):
|
|
| 29 |
if ' ' in pair:
|
| 30 |
p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
|
| 31 |
word_list = [p.sub(''.join(pair), word) for word in word_list]
|
| 32 |
-
return word_list
|
| 33 |
|
| 34 |
def bpe_process(input_text, target_vocab_size):
|
| 35 |
preprocessed_text = preprocess_text(input_text)
|
| 36 |
|
| 37 |
-
# Initialize vocabulary
|
| 38 |
vocab = Counter(preprocessed_text.split())
|
| 39 |
vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
|
| 40 |
vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
|
|
@@ -53,9 +56,9 @@ def bpe_process(input_text, target_vocab_size):
|
|
| 53 |
encoded_text = apply_bpe(preprocessed_text, bpe_codes)
|
| 54 |
|
| 55 |
# Calculate compression ratio
|
| 56 |
-
original_size = len(preprocessed_text
|
| 57 |
compressed_size = len(encoded_text)
|
| 58 |
-
compression_ratio = original_size / compressed_size
|
| 59 |
|
| 60 |
# Check if criteria are met
|
| 61 |
criteria_met = {
|
|
@@ -64,7 +67,7 @@ def bpe_process(input_text, target_vocab_size):
|
|
| 64 |
}
|
| 65 |
|
| 66 |
return (
|
| 67 |
-
|
| 68 |
len(vocab),
|
| 69 |
compression_ratio,
|
| 70 |
criteria_met
|
|
@@ -88,4 +91,4 @@ iface = gr.Interface(
|
|
| 88 |
)
|
| 89 |
|
| 90 |
# Launch the Gradio app
|
| 91 |
-
iface.launch(share=True)
|
|
|
|
| 3 |
from collections import Counter
|
| 4 |
|
| 5 |
def preprocess_text(text: str) -> str:
    """Reduce *text* to Devanagari characters separated by single spaces.

    Every character that is neither whitespace nor inside the Unicode
    Devanagari block (U+0900-U+097F) is deleted, then runs of whitespace
    are collapsed to one space and leading/trailing whitespace is dropped.

    Note: the retained range is the whole Devanagari block, so Devanagari
    punctuation such as the danda ("।") and Devanagari digits are kept;
    only characters outside the block (Latin letters, ASCII digits and
    punctuation, etc.) are removed.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text; an empty string if nothing survives the filter.
    """
    # Drop everything outside the Devanagari block, keeping whitespace so
    # that word boundaries survive for the normalisation step below.
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)
    # split() with no arguments splits on any whitespace run and discards
    # empty pieces, so re-joining yields single-space-separated words.
    text = ' '.join(text.split())
    return text
|
| 11 |
|
| 12 |
def get_stats(vocab):
|
| 13 |
pairs = Counter()
|
|
|
|
| 32 |
if ' ' in pair:
|
| 33 |
p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
|
| 34 |
word_list = [p.sub(''.join(pair), word) for word in word_list]
|
| 35 |
+
return ' '.join(word_list)
|
| 36 |
|
| 37 |
def bpe_process(input_text, target_vocab_size):
|
| 38 |
preprocessed_text = preprocess_text(input_text)
|
| 39 |
|
| 40 |
+
# Initialize vocabulary with character-level tokens and common subwords
|
| 41 |
vocab = Counter(preprocessed_text.split())
|
| 42 |
vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
|
| 43 |
vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
|
|
|
|
| 56 |
encoded_text = apply_bpe(preprocessed_text, bpe_codes)
|
| 57 |
|
| 58 |
# Calculate compression ratio
|
| 59 |
+
original_size = len(preprocessed_text)
|
| 60 |
compressed_size = len(encoded_text)
|
| 61 |
+
compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
|
| 62 |
|
| 63 |
# Check if criteria are met
|
| 64 |
criteria_met = {
|
|
|
|
| 67 |
}
|
| 68 |
|
| 69 |
return (
|
| 70 |
+
encoded_text,
|
| 71 |
len(vocab),
|
| 72 |
compression_ratio,
|
| 73 |
criteria_met
|
|
|
|
| 91 |
)
|
| 92 |
|
| 93 |
# Launch the Gradio app
|
| 94 |
+
iface.launch(share=True)
|