Spaces:

jvamvas
/

romansh-idiom-classification

Running

App Files Files Community

romansh-idiom-classification / app.py

jvamvas

Implement app

1b40346 27 days ago

raw

history blame

3.63 kB

	import re
	import gradio as gr
	from pathlib import Path
	from joblib import load

	# The model load sits behind gr.NO_RELOAD so that, during development in
	# reload mode, the classifier is loaded only once.
	if gr.NO_RELOAD:
	    classifier = load(
	        Path(__file__).parent.joinpath("classification_model", "svm_char_word.joblib")
	    )


	# Maps classifier labels to readable idiom names; labels missing from this
	# table are shown verbatim.
	_LANGUAGE_NAMES = {
	    'rm-sursilv': 'Sursilvan',
	    'rm-vallader': 'Vallader',
	    'rm-rumgr': 'Rumantsch Grischun',
	    'rm-surmiran': 'Surmiran',
	    'rm-puter': 'Puter',
	    'rm-sutsilv': 'Sutsilvan',
	    'unknown': 'Unknown',
	}


	def _scores_to_probabilities(scores, n_classes):
	    """Convert raw decision scores into softmax pseudo-probabilities."""
	    import numpy as np

	    scores = np.asarray(scores, dtype=float)
	    if scores.ndim == 0 and n_classes == 2:
	        # Binary estimators report a single score for the second class;
	        # expand it to one score per class so the softmax is well defined.
	        scores = np.array([-scores, scores])
	    scores = np.atleast_1d(scores)
	    # Subtracting the maximum keeps exp() from overflowing.
	    exp_scores = np.exp(scores - np.max(scores))
	    return exp_scores / np.sum(exp_scores)


	def classify_text(text: str):
	    """Classify Romansh text and return a score for every known idiom.

	    Args:
	        text: The text to classify. Empty, whitespace-only or missing
	            (``None``) input is not passed to the classifier.

	    Returns:
	        The message ``"Please enter some text to classify."`` when there is
	        nothing to classify; otherwise a dict mapping each readable idiom
	        name to a float score. Scores come from ``predict_proba`` when the
	        classifier provides it, and from a softmax over ``decision_function``
	        otherwise.
	    """
	    if not text or not text.strip():
	        return "Please enter some text to classify."

	    class_labels = classifier.classes_

	    # Only the predict_proba call is guarded, so an AttributeError raised
	    # while building the result is not mistaken for a missing method.
	    try:
	        probabilities = classifier.predict_proba([text])[0]
	    except AttributeError:
	        # e.g. LinearSVC has no predict_proba; fall back to decision scores.
	        decision_scores = classifier.decision_function([text])[0]
	        probabilities = _scores_to_probabilities(decision_scores, len(class_labels))

	    return {
	        _LANGUAGE_NAMES.get(label, label): float(probability)
	        for label, probability in zip(class_labels, probabilities)
	    }

	# Load the example sentences from the TSV file; each column header is used
	# as the idiom label for the sentences in that column.
	import pandas as pd
	import os

	tsv_path = os.path.join(os.path.dirname(__file__), "..", "lemmatizer", "demo", "example_sentences.tsv")
	df = pd.read_csv(tsv_path, sep='\t')

	# Collect (sentence, idiom) pairs, dropping missing and blank cells
	examples_data = []
	for idiom_column in df.columns:
	    examples_data.extend(
	        (text, idiom_column)
	        for text in df[idiom_column].dropna()
	        if text.strip()
	    )

	# Plain sentences feed the input box; the labels prefix each one with its idiom
	examples = [pair[0] for pair in examples_data]
	example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]

	# Build the Gradio UI: input textbox on the left, label output on the right
	with gr.Blocks(title="Romansh Idiom Classifier") as demo:
	    gr.Markdown("# Romansh Idiom Classifier")
	    gr.Markdown("Enter Romansh text to classify which idiom/variety it belongs to.")

	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(
	                lines=5,
	                max_lines=10,
	                label="Romansh Text",
	                placeholder="Enter Romansh text here...",
	            )

	        with gr.Column():
	            output = gr.Label(num_top_classes=7, label="Predicted Idiom")

	    # Re-run the classifier whenever the textbox content changes
	    text_input.change(fn=classify_text, inputs=text_input, outputs=output)

	    # Example sentences loaded from the TSV file, shown with idiom-prefixed labels
	    gr.Examples(
	        label="Example Sentences",
	        examples=examples,
	        example_labels=example_labels,
	        examples_per_page=100,
	        inputs=text_input,
	        outputs=output,
	        fn=classify_text,
	        run_on_click=True,
	        cache_examples=True,
	        cache_mode='eager',
	        preload=0,
	    )

	# Start the app only when this file is executed as a script
	if __name__ == "__main__":
	    demo.launch()