import re
import gradio as gr
from pathlib import Path
from joblib import load

# Deserialize the trained model inside the gr.NO_RELOAD guard so the load is
# not repeated when the app is re-run in Gradio's development reload mode.
# NOTE: joblib files are pickle-based; only load model files you trust.
if gr.NO_RELOAD:
    classifier = load(
        Path(__file__).parent.joinpath("classification_model", "svm_char_word.joblib")
    )


# Maps the classifier's raw labels to human-readable idiom names.
# Labels missing from this table are displayed unchanged.
_LANGUAGE_NAMES = {
    'rm-sursilv': 'Sursilvan',
    'rm-vallader': 'Vallader',
    'rm-rumgr': 'Rumantsch Grischun',
    'rm-surmiran': 'Surmiran',
    'rm-puter': 'Puter',
    'rm-sutsilv': 'Sutsilvan',
    'unknown': 'Unknown'
}


def _softmax(scores):
    """Convert raw decision scores into values that sum to 1."""
    # Imported locally because the module does not import numpy at file level.
    import numpy as np

    # Subtracting the maximum keeps np.exp from overflowing on large scores.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)


def classify_text(text: str):
    """Classify Romansh text and return a score per idiom.

    Args:
        text: The text to classify.

    Returns:
        A dict mapping readable idiom names to float scores, or a prompt
        message (str) when ``text`` is ``None``, empty, or whitespace only.
        The scores come from ``classifier.predict_proba`` when the model
        provides it; otherwise they are a softmax over
        ``classifier.decision_function``.
    """
    # Guard against None as well as blank strings so the UI never crashes
    # on a missing value.
    if not text or not text.strip():
        return "Please enter some text to classify."

    # Keep the try body to the single call that may be unavailable.
    try:
        probabilities = classifier.predict_proba([text])[0]
    except AttributeError:
        # e.g. LinearSVC has no predict_proba; fall back to decision scores.
        probabilities = _softmax(classifier.decision_function([text])[0])

    # classes_ is in the same order as the columns of the score vector.
    return {
        _LANGUAGE_NAMES.get(label, label): float(score)
        for label, score in zip(classifier.classes_, probabilities)
    }

# Read examples from the TSV file
import pandas as pd
import os

# The example sentences live next to this script, in a tab-separated file.
tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
df = pd.read_csv(tsv_path, sep='\t')

# Collect (sentence, column name) pairs column by column; the column name is
# used as the idiom label. Missing cells and blank sentences are dropped.
examples_data = []
for idiom_column in df.columns:
    column_sentences = df[idiom_column].dropna()
    examples_data.extend(
        (text, idiom_column) for text in column_sentences if text.strip()
    )

# Parallel lists: the raw sentences and the labels shown for them in the UI.
examples = [pair[0] for pair in examples_data]
example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]

# Create the Gradio interface: a heading and description, an input textbox
# and an output label placed in two columns of one row, and a list of
# clickable example sentences underneath.
with gr.Blocks(title="Romansh Idiom Classifier") as demo:
    gr.Markdown("# Romansh Idiom Classifier")
    gr.Markdown("Enter Romansh text to classify which idiom/variety it belongs to.")
    
    # Attribution for the underlying classification system.
    gr.Markdown("""
    Classification system is based on the following Bachelor's thesis: [Rumantsch Idiom Identification: Building an Automatic Language Identification System](https://seafile.ifi.uzh.ch/f/96df2a17539546e7a192/) (Charlotte Model, 2025).
    """)
    
    with gr.Row():
        with gr.Column():
            # Free-text input; its value is passed to classify_text.
            text_input = gr.Textbox(
                label="Romansh Text",
                placeholder="Enter Romansh text here...",
                lines=5,
                max_lines=10
            )
            
        with gr.Column():
            # Receives the return value of classify_text.
            # Shows at most 7 classes — presumably one per label the
            # classifier can emit; confirm against the model's classes.
            output = gr.Label(
                label="Predicted Idiom",
                num_top_classes=7
            )
    
    # Set up event handlers: re-classify whenever the textbox content changes.
    text_input.change(fn=classify_text, inputs=text_input, outputs=output)
    
    # Add examples from TSV file. `examples` holds the sentences inserted into
    # the textbox; `example_labels` holds the idiom-prefixed labels.
    # NOTE(review): with cache_examples=True and cache_mode='eager', outputs
    # are presumably computed by classify_text at startup, and run_on_click
    # may have no effect while caching is on — confirm against the Gradio
    # version in use.
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=classify_text,
        outputs=output,
        run_on_click=True,
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )

# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()