sagar007 committed on
Commit
a1bcbda
·
verified ·
1 Parent(s): 00ea787

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -7
app.py CHANGED
@@ -3,8 +3,11 @@ import re
3
  from collections import Counter
4
 
5
  def preprocess_text(text):
 
6
  text = re.sub(r'[^\u0900-\u097F\s]', '', text)
7
- return ' '.join(text.split())
 
 
8
 
9
  def get_stats(vocab):
10
  pairs = Counter()
@@ -29,12 +32,12 @@ def apply_bpe(text, bpe_codes):
29
  if ' ' in pair:
30
  p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
31
  word_list = [p.sub(''.join(pair), word) for word in word_list]
32
- return word_list
33
 
34
  def bpe_process(input_text, target_vocab_size):
35
  preprocessed_text = preprocess_text(input_text)
36
 
37
- # Initialize vocabulary
38
  vocab = Counter(preprocessed_text.split())
39
  vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
40
  vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
@@ -53,9 +56,9 @@ def bpe_process(input_text, target_vocab_size):
53
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
54
 
55
  # Calculate compression ratio
56
- original_size = len(preprocessed_text.split())
57
  compressed_size = len(encoded_text)
58
- compression_ratio = original_size / compressed_size
59
 
60
  # Check if criteria are met
61
  criteria_met = {
@@ -64,7 +67,7 @@ def bpe_process(input_text, target_vocab_size):
64
  }
65
 
66
  return (
67
- " ".join(encoded_text),
68
  len(vocab),
69
  compression_ratio,
70
  criteria_met
@@ -88,4 +91,4 @@ iface = gr.Interface(
88
  )
89
 
90
  # Launch the Gradio app
91
- iface.launch(share=True)
 
3
  from collections import Counter
4
 
5
  def preprocess_text(text):
6
+ # Remove punctuation and special characters, keep Hindi characters and spaces
7
  text = re.sub(r'[^\u0900-\u097F\s]', '', text)
8
+ # Remove extra whitespace
9
+ text = ' '.join(text.split())
10
+ return text
11
 
12
  def get_stats(vocab):
13
  pairs = Counter()
 
32
  if ' ' in pair:
33
  p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
34
  word_list = [p.sub(''.join(pair), word) for word in word_list]
35
+ return ' '.join(word_list)
36
 
37
  def bpe_process(input_text, target_vocab_size):
38
  preprocessed_text = preprocess_text(input_text)
39
 
40
+ # Initialize vocabulary with whole words plus all 2- and 3-character substrings
41
  vocab = Counter(preprocessed_text.split())
42
  vocab.update(Counter([preprocessed_text[i:i+2] for i in range(len(preprocessed_text)-1)]))
43
  vocab.update(Counter([preprocessed_text[i:i+3] for i in range(len(preprocessed_text)-2)]))
 
56
  encoded_text = apply_bpe(preprocessed_text, bpe_codes)
57
 
58
  # Calculate compression ratio
59
+ original_size = len(preprocessed_text)
60
  compressed_size = len(encoded_text)
61
+ compression_ratio = original_size / compressed_size if compressed_size != 0 else 0
62
 
63
  # Check if criteria are met
64
  criteria_met = {
 
67
  }
68
 
69
  return (
70
+ encoded_text,
71
  len(vocab),
72
  compression_ratio,
73
  criteria_met
 
91
  )
92
 
93
  # Launch the Gradio app
94
+ iface.launch(share=True)