import os
import re
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd
from joblib import load

# Load the classifier only once during development (reload mode).
# NOTE: joblib.load unpickles the file, so only ever point it at a trusted
# model artifact shipped with the app.
if gr.NO_RELOAD:
    classifier = load(Path(__file__).parent / "classification_model" / "svm_char_word.joblib")

# Map classifier labels to readable idiom names. Labels missing from this
# table are shown unchanged.
LANGUAGE_NAMES = {
    'rm-sursilv': 'Sursilvan',
    'rm-vallader': 'Vallader',
    'rm-rumgr': 'Rumantsch Grischun',
    'rm-surmiran': 'Surmiran',
    'rm-puter': 'Puter',
    'rm-sutsilv': 'Sutsilvan',
    'unknown': 'Unknown',
}


def _softmax(scores):
    """Turn a 1-D array of decision scores into weights that sum to 1."""
    # Subtracting the maximum keeps np.exp from overflowing on large scores.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)


def classify_text(text: str):
    """Classify Romansh text and return prediction with probabilities.

    Args:
        text: The text to classify.

    Returns:
        A dict mapping each readable idiom name to a float score, or a
        prompt string when ``text`` is empty, whitespace-only or ``None``.
        If the classifier has no ``predict_proba``, the scores are a softmax
        over its ``decision_function`` output rather than true probabilities.
    """
    if not text or not text.strip():
        return "Please enter some text to classify."

    # Only the predict_proba call is guarded, so an AttributeError raised
    # anywhere else is not silently rerouted to the fallback.
    try:
        probabilities = classifier.predict_proba([text])[0]
    except AttributeError:
        # LinearSVC doesn't have predict_proba, use decision function instead
        probabilities = _softmax(classifier.decision_function([text])[0])

    return {
        LANGUAGE_NAMES.get(label, label): float(probability)
        for label, probability in zip(classifier.classes_, probabilities)
    }


# Read examples from the TSV file. Each column is iterated as one idiom and
# each non-empty cell as one example sentence.
tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
# dtype=str keeps every cell a string, so .strip() below cannot fail on a
# column that pandas would otherwise parse as numeric.
df = pd.read_csv(tsv_path, sep='\t', dtype=str)

# Create a list of examples with their idiom labels
examples_data = []
for col in df.columns:
    examples_data.extend(
        (sentence, col)
        for sentence in df[col].dropna()
        if sentence.strip()  # Skip empty sentences
    )

# Create the examples list and labels
examples = [pair[0] for pair in examples_data]
example_labels = [f"[{idiom}:] {sentence}" for (sentence, idiom) in examples_data]

# Build the Gradio UI: a text box on the left, the predicted idiom on the right.
with gr.Blocks(title="Romansh Idiom Classifier") as demo:
    # Page header and attribution
    gr.Markdown("# Romansh Idiom Classifier")
    gr.Markdown("Enter Romansh text to classify which idiom/variety it belongs to.")
    gr.Markdown(
        " Classification system is based on the following Bachelor's thesis: "
        "[Rumantsch Idiom Identification: Building an Automatic Language Identification System]"
        "(https://seafile.ifi.uzh.ch/f/96df2a17539546e7a192/) "
        "(Charlotte Model, 2025). "
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                lines=5,
                max_lines=10,
                label="Romansh Text",
                placeholder="Enter Romansh text here...",
            )

        with gr.Column():
            output = gr.Label(num_top_classes=7, label="Predicted Idiom")

    # Re-run the classifier whenever the text box content changes
    text_input.change(
        fn=classify_text,
        inputs=text_input,
        outputs=output,
    )

    # Clickable example sentences loaded from the TSV file, with their
    # outputs cached eagerly
    gr.Examples(
        label="Example Sentences",
        examples=examples,
        example_labels=example_labels,
        examples_per_page=100,
        inputs=text_input,
        outputs=output,
        fn=classify_text,
        run_on_click=True,
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )


if __name__ == "__main__":
    demo.launch()