sagar007 committed on
Commit
e1974fb
·
verified ·
1 Parent(s): 9673a18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -73
app.py CHANGED
@@ -3,29 +3,11 @@ from collections import Counter
3
  import gradio as gr
4
 
5
  def preprocess_text(text):
6
- """Preprocesses Hindi text for BPE.
7
-
8
- Args:
9
- text (str): The Hindi text to preprocess.
10
-
11
- Returns:
12
- str: The preprocessed text.
13
- """
14
-
15
- text = re.sub(r'[^\u0900-\u097F\s]', '', text) # Remove punctuation and special characters
16
- text = ' '.join(text.split()) # Remove extra whitespace
17
  return text
18
 
19
  def get_stats(vocab):
20
- """Gets bigram statistics for BPE merging.
21
-
22
- Args:
23
- vocab (Counter): The vocabulary of word frequencies.
24
-
25
- Returns:
26
- Counter: A counter of bigram frequencies.
27
- """
28
-
29
  pairs = Counter()
30
  for word, freq in vocab.items():
31
  symbols = word.split()
@@ -34,16 +16,6 @@ def get_stats(vocab):
34
  return pairs
35
 
36
  def merge_vocab(pair, v_in):
37
- """Merges bigrams into single tokens in the vocabulary.
38
-
39
- Args:
40
- pair (tuple): The bigram to merge (word1, word2).
41
- v_in (Counter): The input vocabulary.
42
-
43
- Returns:
44
- Counter: The updated vocabulary with the merged bigram.
45
- """
46
-
47
  v_out = {}
48
  bigram = ' '.join(pair)
49
  replacement = ''.join(pair)
@@ -53,16 +25,6 @@ def merge_vocab(pair, v_in):
53
  return v_out
54
 
55
  def apply_bpe(text, bpe_codes):
56
- """Applies BPE to a preprocessed text.
57
-
58
- Args:
59
- text (str): The preprocessed text.
60
- bpe_codes (list): A list of bigram pairs for merging.
61
-
62
- Returns:
63
- list: The encoded text as a list of tokens.
64
- """
65
-
66
  word_list = text.split()
67
  for pair, _ in bpe_codes:
68
  if ' ' in pair:
@@ -70,49 +32,61 @@ def apply_bpe(text, bpe_codes):
70
  word_list = [p.sub(''.join(pair), word) for word in word_list]
71
  return word_list
72
 
73
- def bpe_process(text, target_vocab_size=6000):
74
- """Performs BPE on Hindi text.
75
-
76
- Args:
77
- text (str): The Hindi text to encode.
78
- target_vocab_size (int, optional): The target vocabulary size. Defaults to 6000.
79
-
80
- Returns:
81
- tuple: A tuple containing the encoded text, vocabulary size, and compression ratio.
82
- """
83
-
84
  preprocessed_text = preprocess_text(text)
 
85
  vocab = Counter(preprocessed_text.split())
86
  vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
87
  vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
88
-
89
  bpe_codes = []
90
- while len(vocab) < target_vocab_size:
91
  pairs = get_stats(vocab)
92
  if not pairs:
93
  break
94
  best = max(pairs, key=pairs.get)
95
  vocab = merge_vocab(best, vocab)
96
  bpe_codes.append((best, pairs[best]))
97
-
98
- encoded_text = apply_bpe(preprocessed_text, bpe_codes)
99
- original_size = len(preprocessed_text)
100
- compressed_size = len(encoded_text)
101
- compression_ratio = original_size / compressed_size
102
-
103
- return encoded_text, len(vocab), compression_ratio
104
-
105
- def gradio_demo():
106
- """Creates a Gradio app for BPE in Hindi."""
107
-
108
- iface = gr.Interface(
109
- fn=bpe_process,
110
- inputs="textbox",
111
- outputs=["text", "label", "label"],
112
- title="Hindi Byte Pair Encoding (BPE)",
113
- description="Enter Hindi text and see"
114
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  )
116
 
117
- # Launch the Gradio app
118
- iface.launch(share=True)
 
3
  import gradio as gr
4
 
5
  def preprocess_text(text):
6
+ text = re.sub(r'[^\u0900-\u097F\s]', '', text)
7
+ text = ' '.join(text.split())
 
 
 
 
 
 
 
 
 
8
  return text
9
 
10
  def get_stats(vocab):
 
 
 
 
 
 
 
 
 
11
  pairs = Counter()
12
  for word, freq in vocab.items():
13
  symbols = word.split()
 
16
  return pairs
17
 
18
  def merge_vocab(pair, v_in):
 
 
 
 
 
 
 
 
 
 
19
  v_out = {}
20
  bigram = ' '.join(pair)
21
  replacement = ''.join(pair)
 
25
  return v_out
26
 
27
  def apply_bpe(text, bpe_codes):
 
 
 
 
 
 
 
 
 
 
28
  word_list = text.split()
29
  for pair, _ in bpe_codes:
30
  if ' ' in pair:
 
32
  word_list = [p.sub(''.join(pair), word) for word in word_list]
33
  return word_list
34
 
35
+ def perform_bpe(text):
 
 
 
 
 
 
 
 
 
 
36
  preprocessed_text = preprocess_text(text)
37
+
38
  vocab = Counter(preprocessed_text.split())
39
  vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
40
  vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
41
+
42
  bpe_codes = []
43
+ while True:
44
  pairs = get_stats(vocab)
45
  if not pairs:
46
  break
47
  best = max(pairs, key=pairs.get)
48
  vocab = merge_vocab(best, vocab)
49
  bpe_codes.append((best, pairs[best]))
50
+
51
+ encoded_text = apply_bpe(preprocessed_text, bpe_codes)
52
+ original_size = len(preprocessed_text)
53
+ compressed_size = len(encoded_text)
54
+ compression_ratio = original_size / compressed_size
55
+
56
+ if len(vocab) >= 5000 and compression_ratio >= 3:
57
+ break
58
+
59
+ result = f"Vocabulary size: {len(vocab)}\n"
60
+ result += f"Original size: {original_size}\n"
61
+ result += f"Compressed size: {compressed_size}\n"
62
+ result += f"Compression ratio: {compression_ratio:.2f}X\n\n"
63
+
64
+ if len(vocab) >= 5000 and compression_ratio >= 3:
65
+ result += "Both criteria are met!"
66
+ elif len(vocab) >= 5000:
67
+ result += "Vocabulary size criterion is met, but compression ratio is below 3."
68
+ elif compression_ratio >= 3:
69
+ result += "Compression ratio criterion is met, but vocabulary size is below 5000."
70
+ else:
71
+ result += "Neither criterion is met."
72
+
73
+ return result, ' '.join(encoded_text)
74
+
75
+ def bpe_app(input_text):
76
+ stats, encoded_text = perform_bpe(input_text)
77
+ return stats, encoded_text
78
+
79
+ iface = gr.Interface(
80
+ fn=bpe_app,
81
+ inputs=[
82
+ gr.Textbox(lines=5, label="Input Hindi Text")
83
+ ],
84
+ outputs=[
85
+ gr.Textbox(label="BPE Statistics"),
86
+ gr.Textbox(label="Encoded Text")
87
+ ],
88
+ title="Byte Pair Encoding (BPE) for Hindi Text",
89
+ description="Enter Hindi text to perform BPE encoding. The algorithm will continue until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above."
90
  )
91
 
92
+ iface.launch()