Spaces:

sagar007
/

BPE

Sleeping

App Files Files Community

BPE / app.py

sagar007

Update app.py

f73fa89 verified about 1 year ago

raw

history blame contribute delete

6.87 kB

	import re
	from collections import Counter
	import gradio as gr

	def preprocess_text(text):
	    """Keep only Devanagari characters and normalise the whitespace.

	    Every character that is neither whitespace nor inside the Unicode
	    range U+0900-U+097F is dropped. The remaining whitespace-separated
	    pieces are then re-joined with single spaces, which also trims the
	    ends of the string.
	    """
	    devanagari_only = re.sub(r'[^\u0900-\u097F\s]', '', text)
	    pieces = devanagari_only.split()
	    return ' '.join(pieces)

	def get_stats(vocab):
	    """Tally adjacent symbol pairs over a frequency-weighted vocabulary.

	    ``vocab`` maps an entry (symbols separated by whitespace) to its
	    frequency. Each entry is split on whitespace and every neighbouring
	    pair of symbols is credited with the entry's frequency.

	    Returns a ``Counter`` keyed by ``(left, right)`` symbol tuples.
	    """
	    pair_counts = Counter()
	    for entry, count in vocab.items():
	        tokens = entry.split()
	        # Pair each symbol with its right-hand neighbour.
	        for left, right in zip(tokens, tokens[1:]):
	            pair_counts[left, right] += count
	    return pair_counts

	def merge_vocab(pair, v_in):
	    """Return a copy of ``v_in`` with every occurrence of ``pair`` merged.

	    Args:
	        pair: two-element sequence of symbols to fuse into one symbol.
	        v_in: mapping from entry (whitespace-separated symbols) to frequency.

	    Returns:
	        A new ``dict`` mapping each rewritten entry to its frequency.
	        ``v_in`` is not modified.
	    """
	    # Anchor the bigram on whitespace (or string edges) on both sides so it
	    # only matches whole symbols: merging ("a", "b") must not touch "xa b".
	    bigram = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
	    merged = ''.join(pair)

	    v_out = {}
	    for word, freq in v_in.items():
	        # A callable replacement keeps backslashes in ``merged`` literal.
	        w_out = bigram.sub(lambda _match: merged, word)
	        # Distinct entries can collapse to the same key after merging;
	        # accumulate their frequencies instead of overwriting.
	        v_out[w_out] = v_out.get(w_out, 0) + freq
	    return v_out

	def apply_bpe(text, bpe_codes):
	    """Segment ``text`` by applying learned merges in order.

	    Each whitespace-separated word is first broken into single characters,
	    then every merge in ``bpe_codes`` is applied in sequence wherever its
	    two symbols sit next to each other inside a word.

	    Args:
	        text: input string; words are separated by whitespace.
	        bpe_codes: iterable of ``(pair, count)`` tuples where ``pair`` is a
	            two-element sequence of symbols. ``count`` is ignored.

	    Returns:
	        A list with one string per input word. Each string holds that
	        word's symbols separated by single spaces.
	    """
	    # Start from character-level symbols so that merges have something to
	    # join; a word without internal spaces can never match a bigram.
	    word_list = [' '.join(word) for word in text.split()]

	    for pair, _ in bpe_codes:
	        merged = ''.join(pair)
	        # Match the bigram only at symbol boundaries.
	        p = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
	        # A callable replacement keeps backslashes in ``merged`` literal.
	        word_list = [p.sub(lambda _match: merged, word) for word in word_list]

	    return word_list

	def perform_bpe(text):
	    """Run the merge loop on ``text`` and report size statistics.

	    The text is cleaned with ``preprocess_text``. A vocabulary is built from
	    its words plus every 2- and 3-character window. The most frequent symbol
	    pair is then merged repeatedly until no pair is left, or until the
	    vocabulary holds at least 5000 entries and the compression ratio is at
	    least 3.

	    Args:
	        text: raw input string.

	    Returns:
	        A ``(report, encoded)`` tuple: ``report`` is a multi-line summary
	        string and ``encoded`` is the encoded text joined with spaces.
	    """
	    preprocessed_text = preprocess_text(text)
	    original_size = len(preprocessed_text)

	    # Vocabulary = whole words, then all 2-character and 3-character windows.
	    # NOTE(review): only windows with whitespace in the middle split into two
	    # symbols, so pairs are learned solely from characters on either side of
	    # a space. Confirm this is the intended training data.
	    vocab = Counter(preprocessed_text.split())
	    for width in (2, 3):
	        vocab.update(
	            preprocessed_text[i:i + width]
	            for i in range(len(preprocessed_text) - width + 1)
	        )

	    bpe_codes = []

	    def measure():
	        """Encode with the merges learned so far and compute the ratio."""
	        encoded = apply_bpe(preprocessed_text, bpe_codes)
	        # An entry may hold several space-separated symbols, so count
	        # symbols rather than list entries. For entries without internal
	        # whitespace this equals ``len(encoded)``.
	        size = sum(len(entry.split()) for entry in encoded)
	        ratio = original_size / size if size > 0 else 0
	        return encoded, size, ratio

	    # Measure once up front so the report below is defined even when the
	    # loop finds no pair to merge (empty or single-word input).
	    encoded_text, compressed_size, compression_ratio = measure()

	    while True:
	        pairs = get_stats(vocab)
	        if not pairs:
	            break
	        best = max(pairs, key=pairs.get)
	        vocab = merge_vocab(best, vocab)
	        bpe_codes.append((best, pairs[best]))

	        encoded_text, compressed_size, compression_ratio = measure()

	        if len(vocab) >= 5000 and compression_ratio >= 3:
	            break

	    result = f"Vocabulary size: {len(vocab)}\n"
	    result += f"Original size: {original_size}\n"
	    result += f"Compressed size: {compressed_size}\n"
	    result += f"Compression ratio: {compression_ratio:.2f}X\n\n"

	    vocab_ok = len(vocab) >= 5000
	    ratio_ok = compression_ratio >= 3
	    if vocab_ok and ratio_ok:
	        result += "Both criteria are met!"
	    elif vocab_ok:
	        result += "Vocabulary size criterion is met, but compression ratio is below 3."
	    elif ratio_ok:
	        result += "Compression ratio criterion is met, but vocabulary size is below 5000."
	    else:
	        result += "Neither criterion is met."

	    return result, ' '.join(encoded_text)

	def bpe_app(input_text):
	    """Callback for the Process button.

	    Runs ``perform_bpe`` on ``input_text``, substituting a built-in Hindi
	    sample when the input is empty or otherwise falsy.

	    Returns the statistics report and the encoded text, in that order.
	    """
	    fallback = "नमस्ते! यह एक उदाहरण हिंदी वाक्य है। आप अपना खुद का पाठ यहां दर्ज कर सकते हैं।"
	    chosen_text = input_text if input_text else fallback
	    report, encoded = perform_bpe(chosen_text)
	    return report, encoded

	# Custom CSS handed to gr.Blocks(css=...).
	# The value must be plain CSS: it is not HTML, so it must not be wrapped in
	# <style>...</style> tags. A literal "<style>" in front of the first selector
	# makes that rule (body) invalid, and the closing tag is stray text.
	custom_css = """
	body {
	    font-family: 'Poppins', sans-serif;
	    background: linear-gradient(135deg, #1e3c72, #2a5298);
	    color: #ffffff;
	}
	.container {
	    max-width: 900px;
	    margin: 0 auto;
	    padding: 20px;
	}
	.gradio-container {
	    background-color: rgba(255, 255, 255, 0.1);
	    border-radius: 15px;
	    backdrop-filter: blur(10px);
	    box-shadow: 0 10px 30px rgba(0, 0, 0, 0.2);
	}
	h1 {
	    color: #ffffff;
	    text-align: center;
	    font-size: 2.5em;
	    margin-bottom: 20px;
	    text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
	}
	.gr-button {
	    background-color: #4CAF50 !important;
	    border: none !important;
	    color: white !important;
	    text-transform: uppercase;
	    font-weight: bold;
	    transition: all 0.3s ease;
	}
	.gr-button:hover {
	    background-color: #45a049 !important;
	    box-shadow: 0 5px 15px rgba(0,0,0,0.2);
	    transform: translateY(-2px);
	}
	.gr-form {
	    border-radius: 10px;
	    background-color: rgba(255, 255, 255, 0.1);
	    padding: 20px;
	    margin-bottom: 20px;
	}
	.gr-box {
	    border-radius: 8px;
	    border: 1px solid rgba(255, 255, 255, 0.2);
	    padding: 15px;
	    margin-top: 10px;
	    background-color: rgba(255, 255, 255, 0.05);
	}
	.gr-padded {
	    padding: 15px;
	}
	.gr-input, .gr-textarea {
	    background-color: rgba(255, 255, 255, 0.1) !important;
	    border: 2px solid rgba(255, 255, 255, 0.2) !important;
	    border-radius: 8px !important;
	    color: white !important;
	}
	.gr-input:focus, .gr-textarea:focus {
	    border-color: #4CAF50 !important;
	    box-shadow: 0 0 0 2px rgba(76, 175, 80, 0.2) !important;
	}
	#component-0 {
	    border-radius: 10px;
	    overflow: hidden;
	}
	.footer {
	    text-align: center;
	    margin-top: 20px;
	    font-size: 0.9em;
	    color: rgba(255, 255, 255, 0.7);
	}
	"""

	# HTML header rendered at the top of the page via gr.HTML: the app title
	# and a short description, wrapped in a "container" div.
	html_template = """
	<div class="container">
	<h1>🇮🇳 Byte Pair Encoding for Hindi</h1>
	<p style="text-align: center; margin-bottom: 20px; color: rgba(255, 255, 255, 0.8);">
	Compress and tokenize Hindi text using the BPE algorithm.
	Enter your text below or use one of the examples provided.
	</p>
	</div>
	"""

	# Create Gradio interface with custom theme.
	# Layout, top to bottom: HTML header, input textbox, Process button,
	# statistics and encoded-text outputs side by side, a note on the stopping
	# criteria, example sentences, and a footer.
	with gr.Blocks(css=custom_css) as iface:
	    gr.HTML(html_template)

	    # Input area.
	    with gr.Row():
	        with gr.Column():
	            input_text = gr.Textbox(lines=5, label="Input Hindi Text", placeholder="Enter Hindi text here or leave blank for an example")

	    with gr.Row():
	        submit_btn = gr.Button("Process", variant="primary")

	    # Output area: one column for the report, one for the encoded text.
	    with gr.Row():
	        with gr.Column():
	            output_stats = gr.Textbox(label="BPE Statistics")
	        with gr.Column():
	            output_encoded = gr.Textbox(label="Encoded Text")

	    gr.Markdown("The algorithm continues until it reaches a vocabulary size of 5000+ tokens and a compression ratio of 3 or above.")

	    # Example sentences wired to the input textbox.
	    examples = gr.Examples(
	        examples=[
	            ["नमस्ते दुनिया! यह एक छोटा सा उदाहरण है।"],
	            ["भारत एक विशाल और विविधतापूर्ण देश है, जहाँ कई भाषाएँ बोली जाती हैं।"],
	            ["आज का मौसम बहुत सुहावना है। आकाश में बादल छाए हुए हैं और हल्की बारिश हो रही है।"]
	        ],
	        inputs=[input_text],
	    )

	    # bpe_app returns (stats, encoded_text), matching the two outputs in order.
	    submit_btn.click(bpe_app, inputs=[input_text], outputs=[output_stats, output_encoded])

	    gr.HTML('<div class="footer">Developed with ❤️ for Hindi language processing</div>')

	# Launch the app
	# NOTE(review): runs at import time with inbrowser=True and share=True;
	# confirm both flags are wanted in the hosting environment.
	iface.launch(inbrowser=True, share=True)