Spaces:

sagar007
/

BPE

Build error

App Files Files Community

BPE / app.py

sagar007

Update app.py

00ea787 verified over 1 year ago

raw

history blame

2.9 kB

	import gradio as gr
	import re
	from collections import Counter

	def preprocess_text(text):
	    """Keep only Devanagari characters and whitespace, then normalise spacing.

	    Every character that is neither whitespace nor inside the Unicode range
	    U+0900-U+097F is removed. The surviving whitespace-separated pieces are
	    re-joined with single spaces, which also trims both ends of the result.
	    """
	    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
	    pieces = devanagari_only.split()
	    return ' '.join(pieces)

	def get_stats(vocab):
	    """Count how often each adjacent symbol pair occurs in the vocabulary.

	    Each key of ``vocab`` is split on whitespace into symbols; every pair of
	    neighbouring symbols is credited with that key's frequency.

	    Returns:
	        A ``Counter`` mapping ``(left, right)`` tuples to summed frequencies.
	    """
	    pair_counts = Counter()
	    for entry, count in vocab.items():
	        tokens = entry.split()
	        # Pair each symbol with its right-hand neighbour.
	        for left, right in zip(tokens, tokens[1:]):
	            pair_counts[left, right] += count
	    return pair_counts

	def merge_vocab(pair, v_in):
	    """Fuse every occurrence of a symbol pair inside the vocabulary keys.

	    Args:
	        pair: two symbols ``(left, right)`` that should become one symbol.
	        v_in: mapping from space-separated symbol sequences to frequencies.

	    Returns:
	        A new dict in which each standalone ``"left right"`` occurrence is
	        replaced by ``"leftright"``. Keys that become identical after the
	        merge have their frequencies summed.
	    """
	    bigram = ' '.join(pair)
	    replacement = ''.join(pair)
	    # Match the bigram only when it is bounded by whitespace or the string
	    # edges, so a symbol that merely ends or starts with the same characters
	    # (e.g. "xa b" or "a bc" for the pair ("a", "b")) is left untouched.
	    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

	    v_out = {}
	    for word, freq in v_in.items():
	        # A callable replacement keeps backslashes in symbols literal.
	        w_out = pattern.sub(lambda _match: replacement, word)
	        # Accumulate instead of overwrite so colliding keys keep their counts.
	        v_out[w_out] = v_out.get(w_out, 0) + freq
	    return v_out

	def apply_bpe(text, bpe_codes):
	    """Encode text by replaying learned BPE merges on each word.

	    Args:
	        text: whitespace-separated words to encode.
	        bpe_codes: iterable of ``(pair, frequency)`` entries in the order the
	            merges were learned; ``pair`` is a ``(left, right)`` symbol tuple.
	            The frequency is ignored here.

	    Returns:
	        A flat list of tokens. Each word starts as a list of its characters
	        and every merge, in order, fuses adjacent ``left``/``right`` symbols
	        scanning left to right without overlap.
	    """
	    encoded_by_word = {}  # each distinct word only needs encoding once
	    tokens = []
	    for word in text.split():
	        if word not in encoded_by_word:
	            symbols = list(word)
	            for (first, second), _ in bpe_codes:
	                if len(symbols) < 2:
	                    break  # nothing left that could be merged
	                merged = []
	                i = 0
	                while i < len(symbols):
	                    is_pair = (
	                        i + 1 < len(symbols)
	                        and symbols[i] == first
	                        and symbols[i + 1] == second
	                    )
	                    if is_pair:
	                        merged.append(first + second)
	                        i += 2
	                    else:
	                        merged.append(symbols[i])
	                        i += 1
	                symbols = merged
	            encoded_by_word[word] = symbols
	        tokens.extend(encoded_by_word[word])
	    return tokens

	def bpe_process(input_text, target_vocab_size):
	    """Train BPE on the input text, encode it, and report summary statistics.

	    Args:
	        input_text: raw text; it is cleaned with ``preprocess_text`` first.
	        target_vocab_size: merges stop once the token vocabulary (distinct
	            characters plus merged symbols) reaches this size, or earlier
	            when no adjacent symbol pair is left to merge.

	    Returns:
	        A 4-tuple ``(encoded, vocab_size, compression_ratio, criteria_met)``:
	        the tokens returned by ``apply_bpe`` joined with spaces, the size of
	        the token vocabulary, the number of non-space characters in the
	        cleaned text divided by the number of returned tokens (``0.0`` when
	        there are no tokens), and a dict with the boolean entries
	        ``"vocab_size_met"`` and ``"compression_ratio_met"``.
	    """
	    preprocessed_text = preprocess_text(input_text)
	    words = preprocessed_text.split()

	    # Initialize vocabulary: every distinct word becomes a space-separated
	    # character sequence so get_stats/merge_vocab can operate on symbols.
	    vocab = {' '.join(word): freq for word, freq in Counter(words).items()}
	    # The token vocabulary starts with the individual characters.
	    token_vocab = {symbol for word in words for symbol in word}

	    # Perform BPE merges; each merge contributes one (possibly new) token.
	    bpe_codes = []
	    while len(token_vocab) < target_vocab_size:
	        pairs = get_stats(vocab)
	        if not pairs:
	            break
	        best = max(pairs, key=pairs.get)
	        vocab = merge_vocab(best, vocab)
	        bpe_codes.append((best, pairs[best]))
	        token_vocab.add(''.join(best))

	    # Apply BPE to the cleaned text
	    encoded_text = apply_bpe(preprocessed_text, bpe_codes)

	    # Calculate compression ratio as characters per token; guard against
	    # empty input so the handler does not raise ZeroDivisionError.
	    original_size = sum(len(word) for word in words)
	    compressed_size = len(encoded_text)
	    compression_ratio = original_size / compressed_size if compressed_size else 0.0

	    # Check if criteria are met
	    criteria_met = {
	        "vocab_size_met": len(token_vocab) >= 5000,
	        "compression_ratio_met": compression_ratio >= 3,
	    }

	    return (
	        " ".join(encoded_text),
	        len(token_vocab),
	        compression_ratio,
	        criteria_met,
	    )

	# Components shown on the input side: the text to encode and the
	# vocabulary size at which merge learning should stop.
	input_components = [
	    gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here..."),
	    gr.Slider(minimum=1000, maximum=10000, step=100, value=6000, label="Target Vocabulary Size"),
	]

	# Components on the output side, in the same order as the tuple returned
	# by bpe_process.
	output_components = [
	    gr.Textbox(label="Encoded Text"),
	    gr.Number(label="Vocabulary Size"),
	    gr.Number(label="Compression Ratio"),
	    gr.JSON(label="Criteria Met"),
	]

	# Wire bpe_process into a Gradio interface.
	iface = gr.Interface(
	    fn=bpe_process,
	    inputs=input_components,
	    outputs=output_components,
	    title="Byte Pair Encoding (BPE) for Hindi",
	    description="Encode Hindi text using Byte Pair Encoding. Set the target vocabulary size and see the encoded output along with vocabulary size and compression ratio.",
	)

	# Start the app as soon as this module runs.
	# NOTE(review): share=True asks Gradio for a public share link; confirm
	# that is wanted in the environment this is deployed to.
	iface.launch(share=True)