sagar007 committed on
Commit
9736ac2
·
verified ·
1 Parent(s): 20f38a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -58
app.py CHANGED
@@ -1,15 +1,31 @@
1
- import gradio as gr
2
  import re
3
  from collections import Counter
 
4
 
5
  def preprocess_text(text):
6
- # Remove punctuation and special characters, keep Hindi characters and spaces
7
- text = re.sub(r'[^\u0900-\u097F\s]', '', text)
8
- # Remove extra whitespace
9
- text = ' '.join(text.split())
 
 
 
 
 
 
 
10
  return text
11
 
12
  def get_stats(vocab):
 
 
 
 
 
 
 
 
 
13
  pairs = Counter()
14
  for word, freq in vocab.items():
15
  symbols = word.split()
@@ -18,75 +34,80 @@ def get_stats(vocab):
18
  return pairs
19
 
20
  def merge_vocab(pair, v_in):
 
 
 
 
 
 
 
 
 
 
21
  v_out = {}
22
  bigram = ' '.join(pair)
23
  replacement = ''.join(pair)
24
  for word in v_in:
25
- # Use regex to ensure whole-word replacement
26
- w_out = re.sub(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)', replacement, word)
27
  v_out[w_out] = v_in[word]
28
  return v_out
29
 
30
  def apply_bpe(text, bpe_codes):
 
 
 
 
 
 
 
 
 
 
31
  word_list = text.split()
32
  for pair, _ in bpe_codes:
33
- p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
34
- word_list = [p.sub(''.join(pair), word) for word in word_list]
35
- return ' '.join(word_list)
36
-
37
- def bpe_process(input_text, target_vocab_size):
38
- preprocessed_text = preprocess_text(input_text)
39
-
40
- # Initialize vocabulary
 
 
 
 
 
 
 
 
 
41
  vocab = Counter(preprocessed_text.split())
42
-
43
- # Perform BPE merges
 
44
  bpe_codes = []
45
- while len(vocab) < target_vocab_size and len(vocab) > 1:
46
  pairs = get_stats(vocab)
47
  if not pairs:
48
  break
49
  best = max(pairs, key=pairs.get)
50
  vocab = merge_vocab(best, vocab)
51
  bpe_codes.append((best, pairs[best]))
52
-
53
- # Apply BPE to the original text
54
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
55
-
56
- # Calculate compression ratio
57
- original_size = len(preprocessed_text.split())
58
- compressed_size = len(encoded_text.split())
59
- compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
60
-
61
- # Check if criteria are met
62
- criteria_met = {
63
- "vocab_size_met": len(vocab) >= 5000,
64
- "compression_ratio_met": compression_ratio >= 3
65
- }
66
-
67
- return (
68
- encoded_text,
69
- len(vocab),
70
- compression_ratio,
71
- criteria_met
72
- )
73
-
74
- # Define the Gradio interface
75
- iface = gr.Interface(
76
- fn=bpe_process,
77
- inputs=[
78
- gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
79
- gr.Slider(minimum=1000, maximum=10000, step=100, value=6000, label="Target Vocabulary Size")
80
- ],
81
- outputs=[
82
- gr.Textbox(label="Encoded Text"),
83
- gr.Number(label="Vocabulary Size"),
84
- gr.Number(label="Compression Ratio"),
85
- gr.JSON(label="Criteria Met")
86
- ],
87
- title="Byte Pair Encoding (BPE) for Hindi",
88
- description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio."
89
- )
90
-
91
- # Launch the Gradio app
92
- iface.launch(share=True)
 
 
1
  import re
2
  from collections import Counter
3
+ import gradio as gr
4
 
def preprocess_text(text):
    """Normalize Hindi text before BPE training or encoding.

    Every non-whitespace character outside the Devanagari block
    (U+0900-U+097F) is deleted. The Devanagari punctuation marks that sit
    inside that block (danda U+0964, double danda U+0965, abbreviation sign
    U+0970) are deleted as well; a plain range test would keep them glued
    to the preceding word. Whitespace runs are then collapsed to single
    spaces and leading/trailing whitespace is trimmed.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text, words separated by single spaces.
    """
    # Unwanted characters are dropped (not replaced by a space), so removing
    # a character in the middle of a word never splits that word in two.
    text = re.sub(r'[^\u0900-\u097F\s]|[\u0964\u0965\u0970]', '', text)
    return ' '.join(text.split())
18
 
19
  def get_stats(vocab):
20
+ """Gets bigram statistics for BPE merging.
21
+
22
+ Args:
23
+ vocab (Counter): The vocabulary of word frequencies.
24
+
25
+ Returns:
26
+ Counter: A counter of bigram frequencies.
27
+ """
28
+
29
  pairs = Counter()
30
  for word, freq in vocab.items():
31
  symbols = word.split()
 
34
  return pairs
35
 
def merge_vocab(pair, v_in):
    """Fuse every occurrence of an adjacent symbol pair in the vocabulary.

    Keys of ``v_in`` are sequences of symbols separated by single spaces.
    Wherever the two symbols of ``pair`` appear next to each other as whole
    symbols, they are replaced by their concatenation.

    Args:
        pair (tuple): The two adjacent symbols to merge, e.g. ``('a', 'b')``.
        v_in (dict): Maps space-separated symbol sequences to frequencies.

    Returns:
        dict: A new mapping with the pair merged. If two keys become
        identical after merging, their frequencies are added together.
    """
    # The lookarounds anchor the match on symbol boundaries. A plain
    # substring replace would also rewrite "xa b" to "xab" when merging
    # ('a', 'b'), corrupting unrelated symbols.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    replacement = ''.join(pair)

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement is inserted literally (no backslash
        # escape processing on the merged symbol).
        w_out = pattern.sub(lambda _match: replacement, word)
        # Accumulate instead of overwrite so colliding keys keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out
54
 
def apply_bpe(text, bpe_codes):
    """Encode preprocessed text with a learned list of BPE merges.

    Each whitespace-separated word is first broken into single characters.
    The merges are then applied in the order given: every adjacent
    occurrence of a pair inside a word is fused into one symbol. Merges
    never cross word boundaries.

    Args:
        text (str): Preprocessed text, words separated by whitespace.
        bpe_codes (list): ``(pair, frequency)`` entries in learning order,
            where ``pair`` is a 2-tuple of symbols. The frequency is ignored.

    Returns:
        list: The encoded text as a flat list of symbol strings.
    """
    tokens = []
    for word in text.split():
        symbols = list(word)
        for pair, _ in bpe_codes:
            if len(symbols) < 2:
                # Nothing left to merge in this word.
                break
            left, right = pair
            merged = []
            i = 0
            while i < len(symbols):
                if (i + 1 < len(symbols)
                        and symbols[i] == left
                        and symbols[i + 1] == right):
                    merged.append(left + right)
                    i += 2  # consume both halves of the pair
                else:
                    merged.append(symbols[i])
                    i += 1
            symbols = merged
        tokens.extend(symbols)
    return tokens
72
+
def bpe_process(text, target_vocab_size=6000):
    """Learn BPE merges on Hindi text and encode the same text with them.

    The text is cleaned with ``preprocess_text``. Each word is written as
    space-separated characters, which is the form ``get_stats`` and
    ``merge_vocab`` operate on. The most frequent adjacent pair is merged
    repeatedly until the number of distinct symbols reaches
    ``target_vocab_size`` or no pair is left to merge.

    Args:
        text (str): The Hindi text to encode.
        target_vocab_size (int, optional): Upper bound on the number of
            distinct symbols (base characters plus merged symbols).
            Defaults to 6000.

    Returns:
        tuple: ``(encoded_text, vocab_size, compression_ratio)`` where
        ``encoded_text`` is whatever ``apply_bpe`` returns, ``vocab_size``
        is the number of distinct symbols, and ``compression_ratio`` is the
        character length of the preprocessed text divided by the number of
        encoded tokens (0.0 when there are no tokens).
    """
    preprocessed_text = preprocess_text(text)
    words = preprocessed_text.split()

    # Seed the vocabulary with words split into characters; whole words or
    # raw character windows give the pair counter nothing meaningful to merge.
    vocab = Counter(' '.join(word) for word in words)
    symbols = {char for word in words for char in word}

    bpe_codes = []
    # Each merge adds exactly one new symbol, so this loop always terminates:
    # either the target is reached or the pairs run out.
    while len(symbols) < target_vocab_size:
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes.append((best, pairs[best]))
        symbols.add(''.join(best))

    encoded_text = apply_bpe(preprocessed_text, bpe_codes)
    original_size = len(preprocessed_text)
    compressed_size = len(encoded_text)
    # Empty or punctuation-only input yields no tokens; avoid dividing by zero.
    compression_ratio = original_size / compressed_size if compressed_size else 0.0

    return encoded_text, len(symbols), compression_ratio
104
+
105
+ def gradio_demo():
106
+ """Creates a Gradio app for BPE in Hindi."""
107
+
108
+ iface = gr.Interface(
109
+ fn=bpe_process,
110
+ inputs="textbox",
111
+ outputs=["text", "label", "label"],
112
+ title="Hindi Byte Pair Encoding (BPE)",
113
+ description="Enter Hindi text and see