khateeb_standalone

Sleeping

App Files Files Community

Bisher committed on Apr 20

Commit

2b1b4a0

verified ·

1 Parent(s): 13db942

Update app.py

Browse files

Files changed (1) hide show

app.py +153 -284

app.py CHANGED Viewed

@@ -1,311 +1,180 @@
 import gradio as gr
-from gradio_client import Client, handle_file
 import jiwer
-import os
-import time
-import warnings
 import pyarabic.araby as araby
-import difflib # Import difflib
-# Suppress specific UserWarnings from jiwer related to empty strings
-warnings.filterwarnings("ignore", message="Reference is empty.*", category=UserWarning)
-warnings.filterwarnings("ignore", message="Hypothesis is empty.*", category=UserWarning)
-# --- Constants ---
-DIACRITIZATION_API_URL = "Bisher/CATT.diacratization"
-TRANSCRIPTION_API_URL = "Bisher/arabic_syllable_transcription"
-# Define Arabic diacritics
-# Use a try-except block in case pyarabic is not installed or fails to import
 try:
-    ARABIC_DIACRITICS = {
         araby.FATHA, araby.FATHATAN, araby.DAMMA, araby.DAMMATAN,
         araby.KASRA, araby.KASRATAN, araby.SUKUN, araby.SHADDA,
     }
-except (ImportError, NameError):
-    print("Warning: pyarabic not found or failed to import. Using fallback diacritics set.")
-    ARABIC_DIACRITICS = {'\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652'}
-# --- API Clients ---
-# Use caching or global clients to avoid re-initializing on every call
-diacritization_client = None
-transcription_client = None
-def get_diacritization_client():
-    global diacritization_client
-    if diacritization_client is None:
-        try:
-            diacritization_client = Client(DIACRITIZATION_API_URL, download_files=True)
-        except Exception as e:
-            print(f"Error initializing diacritization client: {e}")
-            return None
-    return diacritization_client
-def get_transcription_client():
-    global transcription_client
-    if transcription_client is None:
-        try:
-            transcription_client = Client(TRANSCRIPTION_API_URL, download_files=True)
-        except Exception as e:
-            print(f"Error initializing transcription client: {e}")
-            return None
-    return transcription_client
-# --- Helper Functions ---
-def diacritize_text_api(text_to_diacritize):
-    """Calls the diacritization API."""
-    if not text_to_diacritize or not text_to_diacritize.strip():
-        return "Please enter some text to diacritize.", "" # Return two values as expected by the click handler
-    client = get_diacritization_client()
-    if not client:
-        return "Error: Could not connect to the diacritization service.", ""
-    try:
-        result = client.predict(
-            model_type="Encoder-Only",
-            input_text=text_to_diacritize,
-            api_name="/predict"
-        )
-        # Ensure result is a string, handle potential None or unexpected types
-        result_str = str(result) if result is not None else "Error: Empty response from diacritization service."
-        # Return the result for both the output textbox and the state
-        return result_str, result_str
-    except Exception as e:
-        print(f"Error during diacritization API call: {e}")
-        return f"Error during diacritization: {e}", ""
-def transcribe_audio_api(audio_filepath):
-    """Calls the standard transcription API."""
-    if not audio_filepath:
-        return "Error: Please provide an audio recording or file."
-    if not os.path.exists(audio_filepath):
-        return f"Error: Audio file not found at {audio_filepath}"
-    client = get_transcription_client()
-    if not client:
-        return "Error: Could not connect to the transcription service."
-    try:
-        # Add a small delay if needed, sometimes helps with API race conditions
-        # time.sleep(0.5)
-        result = client.predict(
-            audio=handle_file(audio_filepath),
-            api_name="/predict"
-        )
-        return result[0], result[1]
-    except Exception as e:
-        print(f"Error during transcription API call: {e}")
-        return f"Error during transcription: {e}"
-def get_diacritics_sequence(text):
-    """Extracts diacritics from a string."""
-    if not isinstance(text, str):
-        return ""
-    diacritics_only = [c for c in text if c in ARABIC_DIACRITICS]
-    return ' '.join(diacritics_only)
-def calculate_metrics(reference, hypothesis):
-    """Calculates WER, DER, CER."""
-    ref = reference or ""
-    hyp = hypothesis or ""
-    # Handle cases where one or both are empty or just whitespace
-    if not ref.strip() and not hyp.strip():
-        return 0.0, 0.0, 0.0 # Both empty, 0 error
-    if not ref.strip():
-        return 1.0, 1.0, 1.0 # Reference empty, hypothesis not: Max error
-    if not hyp.strip():
-        # Hypothesis empty, reference not: Max error (though jiwer might handle this)
-         # Let jiwer calculate based on its rules for empty hypothesis
-         pass
-    try:
-        # WER
-        wer = jiwer.wer(ref, hyp)
-        # DER
-        ref_d = get_diacritics_sequence(ref)
-        hyp_d = get_diacritics_sequence(hyp)
-        # Handle empty diacritic sequences for DER calculation
-        if not ref_d.strip() and not hyp_d.strip():
-            der = 0.0
-        elif not ref_d.strip():
-            der = 1.0
-        else:
-            der = jiwer.wer(ref_d, hyp_d) # jiwer handles empty hyp_d if ref_d is not empty
-        # CER
-        cer = jiwer.cer(ref, hyp)
-        return round(wer, 4), round(der, 4), round(cer, 4)
-    except Exception as e:
-        print(f"Error calculating metrics: {e}")
-        return None, None, None # Indicate error in calculation
-def highlight_errors(reference, hypothesis):
-    """Highlights differences between reference and hypothesis using HTML mark tag."""
-    ref = reference or ""
-    hyp = hypothesis or ""
-    ref_words = ref.split()
-    hyp_words = hyp.split()
-    if not ref_words and not hyp_words:
-        return "", "" # No errors if both are empty
-    matcher = difflib.SequenceMatcher(None, ref_words, hyp_words, autojunk=False)
-    highlighted_hyp_words = []
-    error_words_ref = [] # Words in reference that were deleted or replaced
-    error_words_hyp = [] # Words in hypothesis that were inserted or replaced
     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
         if tag == 'equal':
-            highlighted_hyp_words.extend(hyp_words[j1:j2])
         elif tag == 'replace':
-            # Mark incorrect words in hypothesis red
-            for word in hyp_words[j1:j2]:
-                 highlighted_hyp_words.append(f"<mark style='background-color: #ffcccb;'>{word}</mark>")
-            error_words_ref.extend(ref_words[i1:i2])
-            error_words_hyp.extend(hyp_words[j1:j2])
         elif tag == 'delete':
-            # Indicate missing words (maybe with a placeholder?) - for now, just note them
-            # We don't add anything to highlighted_hyp_words here as they are missing
-             error_words_ref.extend(ref_words[i1:i2])
-             # Optionally add a placeholder in the output to show where deletion happened
-             # highlighted_hyp_words.append("<mark style='background-color: #lightgrey;'>[missing]</mark>")
         elif tag == 'insert':
-             # Mark inserted words in hypothesis green
-            for word in hyp_words[j1:j2]:
-                 highlighted_hyp_words.append(f"<mark style='background-color: #ccffcc;'>{word}</mark>")
-            error_words_hyp.extend(hyp_words[j1:j2])
-    html_output = ' '.join(highlighted_hyp_words)
-    # Combine unique error words for the list
-    error_list = sorted(list(set(error_words_ref + error_words_hyp)))
-    return html_output, ', '.join(error_list)
-# --- Gradio Interface ---
 with gr.Blocks(theme=gr.themes.Soft()) as app:
-    gr.Markdown(
-        """
-        # Arabic Diacritization and Reading Assessment Tool
-        1.  Enter undiacritized Arabic text and click **Diacritize Text**.
-        2.  Read the generated **Diacritized Text** aloud and record or upload audio.
-        3.  Click **Transcribe and Compare** to see the transcript, syllable transcript, WER/DER/CER, and mispronounced words highlighted.
-        """
-    )
-    # Using gr.State to hold the diacritized reference text between steps
-    reference_text_state = gr.State("")
     with gr.Row():
         with gr.Column(scale=1):
-            text_input = gr.Textbox(label="Undiacritized Arabic Text", lines=3, text_align="right")
-            diacritize_btn = gr.Button("Diacritize Text")
-            diacritized_output = gr.Textbox(
-                label="Diacritized Text (Reference)",
-                lines=3,
-                interactive=True, # User shouldn't edit this directly
-                text_align="right",
-            )
-            diacritized_output.change(
-                fn=lambda text: text,
-                inputs=diacritized_output,
-                outputs=reference_text_state
-            )
         with gr.Column(scale=1):
-            audio_input = gr.Audio(label="Record or Upload Audio", type="filepath", sources=["microphone", "upload"])
-            transcribe_btn = gr.Button("Transcribe and Compare")
-            transcript_output = gr.Textbox(
-                label="Transcript (Hypothesis)",
-                lines=3,
-                interactive=False,
-                text_align="right"
-            )
-            # Ensure this Textbox is defined correctly
-            transcript_syllables_output = gr.Textbox(
-                label="Transcript Syllables (Hypothesis)", # Corrected label slightly for clarity
-                lines=3,
-                interactive=False,
-                text_align="right"
             )
-            with gr.Row():
-                wer_out = gr.Number(label="WER", interactive=False, precision=4)
-                der_out = gr.Number(label="DER", interactive=False, precision=4)
-                cer_out = gr.Number(label="CER", interactive=False, precision=4)
-            # Use Markdown for potentially richer HTML display if needed, but HTML component is fine
-            error_html = gr.HTML(label="Highlighted Errors in Hypothesis")
-            error_list = gr.Textbox(label="Words Involved in Errors", interactive=False) # Changed label
-    # --- Event Handlers ---
-    # When Diacritize button is clicked
-    diacritize_btn.click(
-        fn=diacritize_text_api,
-        inputs=[text_input],
-        # Output to the display box AND the hidden state
-        outputs=[diacritized_output, reference_text_state]
-    )
-    # Define the main processing function that returns all 7 values
-    def process_audio_and_compare(audio_filepath, reference_text):
-        """Processes audio, gets both transcripts, calculates metrics, and highlights errors."""
-        # Default values in case of errors
-        transcript = "Error: Processing failed."
-        syllable_transcript = "Error: Processing failed."
-        wer, der, cer = None, None, None
-        html_output = ""
-        error_words = ""
-        # Validate inputs
-        if not audio_filepath:
-            transcript = "Error: No audio provided."
-            syllable_transcript = "Error: No audio provided."
-             # Return 7 values even on input error
-            return transcript, syllable_transcript, None, None, None, "", ""
-        if not reference_text:
-             transcript = "Error: No reference text found. Please diacritize first."
-             syllable_transcript = "Error: No reference text found."
-             # Return 7 values
-             return transcript, syllable_transcript, None, None, None, "", ""
-        try:
-            # --- Call Transcription APIs ---
-            transcript, syllable_transcript = transcribe_audio_api(audio_filepath)
-        except:
-            print(f"Error calculating metrics: {e}")
-            transcript, syllable_transcript = "error", "error"
-        # --- Calculate Metrics and Highlight Errors (only if first transcript is not an error) ---
-        if not transcript.startswith("Error"):
-            wer, der, cer = calculate_metrics(reference_text, transcript)
-            # Use the standard transcript for highlighting, adjust if needed
-            html_output, error_words = highlight_errors(reference_text, transcript)
-        else:
-            # If the main transcript failed, indicate no metrics/highlighting possible
-            wer, der, cer = None, None, None
-            html_output = "Highlighting not available due to transcription error."
-            error_words = "N/A"
-        # --- Return all 7 values ---
-        return transcript, syllable_transcript, wer, der, cer, html_output, error_words
-    # When Transcribe button is clicked
-    transcribe_btn.click(
-        fn=process_audio_and_compare,
-        # Get audio path and the reference text from the state
-        inputs=[audio_input, reference_text_state],
-        # Update all 7 output components
-        outputs=[
-            transcript_output,
-            transcript_syllables_output, # This should now update correctly
-            wer_out,
-            der_out,
-            cer_out,
-            error_html,
-            error_list
-        ]
-    )
-# Launch the app
 if __name__ == "__main__":
-    app.launch(debug=True, ssr_mode=False) # Set share=True if you need a public link

+import os
+import sys
+import urllib.request
+import torch
 import gradio as gr
 import jiwer
+import difflib
 import pyarabic.araby as araby
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
+# ---------- Setup: Clone CATT repo & download diacritization models ----------
+CATT_REPO_URL = "https://github.com/abjadai/catt.git"
+CATT_FOLDER = "catt"
+MODELS_DIR = "models"
+ED_URL = "https://github.com/abjadai/catt/releases/download/v2/best_ed_mlm_ns_epoch_178.pt"
+EO_URL = "https://github.com/abjadai/catt/releases/download/v2/best_eo_mlm_ns_epoch_193.pt"
+os.makedirs(MODELS_DIR, exist_ok=True)
+# Clone if needed
+if not os.path.isdir(CATT_FOLDER):
+    os.system(f"git clone {CATT_REPO_URL}")
+if CATT_FOLDER not in sys.path:
+    sys.path.append(CATT_FOLDER)
+# Download checkpoints
+for url in (ED_URL, EO_URL):
+    fname = os.path.basename(url)
+    dest = os.path.join(MODELS_DIR, fname)
+    if not os.path.isfile(dest):
+        urllib.request.urlretrieve(url, dest)
+# Import CATT modules
+# NOTE(review): these modules are presumably provided by the CATT checkout
+# whose folder was appended to sys.path above, so the imports have to stay
+# below the clone step — confirm against the repository layout.
+from tashkeel_tokenizer import TashkeelTokenizer
+from utils import remove_non_arabic
+from ed_pl import TashkeelModel as TashkeelModel_ED
+from eo_pl import TashkeelModel as TashkeelModel_EO
+# Prepare tokenizer & device
+# One tokenizer instance is shared by both diacritization models.
+tokenizer = TashkeelTokenizer()
+# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load diacritization models
+def load_diacritization_models():
+    global model_ed, model_eo
+    max_seq_len = 1024
+    model_ed = TashkeelModel_ED(tokenizer, max_seq_len=max_seq_len, n_layers=3, learnable_pos_emb=False)
+    model_ed.load_state_dict(torch.load(os.path.join(MODELS_DIR, os.path.basename(ED_URL)), map_location=device))
+    model_ed.eval().to(device)
+    model_eo = TashkeelModel_EO(tokenizer, max_seq_len=max_seq_len, n_layers=6, learnable_pos_emb=False)
+    model_eo.load_state_dict(torch.load(os.path.join(MODELS_DIR, os.path.basename(EO_URL)), map_location=device))
+    model_eo.eval().to(device)
+load_diacritization_models()
+# ---------- Setup: Arabic syllable transcription pipelines ----------
+# All three objects are created at import time from models named by their
+# Hugging Face ids.
+# Speech -> syllable sequence (wav2vec2 checkpoint, per the model id).
+ASR_PIPE = pipeline("automatic-speech-recognition", model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2")
+# Syllable sequence -> text (MT5 seq2seq checkpoint, per the model id), with its tokenizer.
+MT5_MODEL = AutoModelForSeq2SeqLM.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
+MT5_TOKENIZER = AutoTokenizer.from_pretrained("IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5")
+# Evaluation mode only; the model is not moved to `device` here.
+MT5_MODEL.eval()
+# Arabic diacritics set
 try:
+    DIACRITICS = {
         araby.FATHA, araby.FATHATAN, araby.DAMMA, araby.DAMMATAN,
         araby.KASRA, araby.KASRATAN, araby.SUKUN, araby.SHADDA,
     }
+except:
+    DIACRITICS = {'\u064B','\u064C','\u064D','\u064E','\u064F','\u0650','\u0651','\u0652'}
+# ---------- Core Functions ----------
+def diacritize_text(model_type, input_text):
+    text_clean = remove_non_arabic(input_text.strip())
+    if not text_clean:
+        return "Please enter some Arabic text."
+    x = [text_clean]
+    if model_type == "Encoder-Decoder":
+        out = model_ed.do_tashkeel_batch(x, batch_size=16, verbose=False)
+    else:
+        out = model_eo.do_tashkeel_batch(x, batch_size=16, verbose=False)
+    return out[0] if out else ""
+def get_and_process_syllables(audio_path):
+    # ASR -> syllable sequence -> MT5 conversion
+    clip = ASR_PIPE(audio_path)["text"]
+    seq = "|" + clip.replace(" ", "|") + "."
+    input_ids = MT5_TOKENIZER.encode(seq, return_tensors="pt")
+    out_ids = MT5_MODEL.generate(
+        input_ids,
+        max_length=100,
+        early_stopping=True,
+        pad_token_id=MT5_TOKENIZER.pad_token_id,
+        bos_token_id=MT5_TOKENIZER.bos_token_id,
+        eos_token_id=MT5_TOKENIZER.eos_token_id,
+    )
+    text = MT5_TOKENIZER.decode(out_ids[0][1:], skip_special_tokens=True).split('.')[0]
+    return text, seq
+def get_diacritics_sequence(txt):
+    return ' '.join([c for c in txt if c in DIACRITICS])
+def calculate_metrics(ref, hyp):
+    if not ref.strip() and not hyp.strip(): return 0.0, 0.0, 0.0
+    if not ref.strip(): return 1.0, 1.0, 1.0
+    wer = jiwer.wer(ref, hyp)
+    ref_d, hyp_d = get_diacritics_sequence(ref), get_diacritics_sequence(hyp)
+    der = 0.0 if (not ref_d and not hyp_d) else (1.0 if not ref_d else jiwer.wer(ref_d, hyp_d))
+    cer = jiwer.cer(ref, hyp)
+    return round(wer,4), round(der,4), round(cer,4)
+def highlight_errors(ref, hyp):
+    ref_w, hyp_w = ref.split(), hyp.split()
+    matcher = difflib.SequenceMatcher(None, ref_w, hyp_w, autojunk=False)
+    out_words, errs = [], []
     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
         if tag == 'equal':
+            out_words.extend(hyp_w[j1:j2])
         elif tag == 'replace':
+            for w in hyp_w[j1:j2]: out_words.append(f"<mark style='background-color:#ffcccb;'>{w}</mark>")
+            errs.extend(ref_w[i1:i2] + hyp_w[j1:j2])
         elif tag == 'delete':
+            errs.extend(ref_w[i1:i2])
         elif tag == 'insert':
+            for w in hyp_w[j1:j2]: out_words.append(f"<mark style='background-color:#ccffcc;'>{w}</mark>")
+            errs.extend(hyp_w[j1:j2])
+    return ' '.join(out_words), ', '.join(sorted(set(errs)))
+def process_audio_and_compare(audio_path, reference_text):
+    if not audio_path:
+        return *("Error: No audio provided.",)*2, None, None, None, "", ""
+    if not reference_text.strip():
+        return *("Error: No reference text.",)*2, None, None, None, "", ""
+    hyp, syll = get_and_process_syllables(audio_path)
+    wer, der, cer = calculate_metrics(reference_text, hyp) if not hyp.startswith("Error") else (None,None,None)
+    html_out, errs = highlight_errors(reference_text, hyp) if not hyp.startswith("Error") else ("", "")
+    return hyp, syll, wer, der, cer, html_out, errs
+# ---------- Gradio Interface ----------
 with gr.Blocks(theme=gr.themes.Soft()) as app:
+    gr.Markdown("""
+    # Arabic Diacritization & Reading Assessment
+    1. Enter undiacritized Arabic text → Diacritize.
+    2. Read aloud & record/upload audio → Transcribe & Compare.
+    """)
+    ref_state = gr.State("")
     with gr.Row():
         with gr.Column(scale=1):
+            text_in = gr.Textbox(label="Undiacritized Arabic Text", lines=3, text_align="right")
+            model_sel = gr.Dropdown(choices=["Encoder-Only","Encoder-Decoder"], value="Encoder-Only", label="Model")
+            diac_btn = gr.Button("Diacritize Text")
+            diac_out = gr.Textbox(label="Diacritized Text (Reference)", lines=3, text_align="right")
+            diac_btn.click(fn=diacritize_text, inputs=[model_sel, text_in], outputs=[diac_out, ref_state])
         with gr.Column(scale=1):
+            audio_in = gr.Audio(label="Record/Upload Audio", type="filepath")
+            trans_btn = gr.Button("Transcribe & Compare")
+            hyp_out = gr.Textbox(label="Transcript (Hypothesis)", lines=3, text_align="right")
+            syl_out = gr.Textbox(label="Transcript Syllables", lines=3, text_align="right")
+            wer_n = gr.Number(label="WER", precision=4)
+            der_n = gr.Number(label="DER", precision=4)
+            cer_n = gr.Number(label="CER", precision=4)
+            err_html = gr.HTML(label="Highlighted Errors")
+            err_list = gr.Textbox(label="Error Words")
+            trans_btn.click(
+                fn=process_audio_and_compare,
+                inputs=[audio_in, ref_state],
+                outputs=[hyp_out, syl_out, wer_n, der_n, cer_n, err_html, err_list]
             )
+# Launch
+# Start the Gradio server only when the file is executed as a script.
 if __name__ == "__main__":
+    app.launch(debug=True)