ABAO77 committed on
Commit cef1d4a · 1 Parent(s): 1a5420f

feat: implement Wav2Vec2 character-level ASR with ONNX and Transformers support, add phoneme comparison and feedback generation
src/apis/controllers/speaking_controller.py CHANGED
@@ -121,58 +121,91 @@ class WhisperASR:
         }
 
 
-class Wav2Vec2CharacterASRONNX:
-    """Wav2Vec2 character-level ASR with ONNX runtime - no language model correction"""
+class Wav2Vec2CharacterASR:
+    """Wav2Vec2 character-level ASR with support for both ONNX and Transformers inference"""
 
     def __init__(
         self,
+        model_name: str = "facebook/wav2vec2-large-960h-lv60-self",
+        onnx: bool = False,
         onnx_model_path: str = "./wav2vec2_asr.onnx",
-        processor_name: str = "facebook/wav2vec2-base-960h",
     ):
         """
-        Initialize Wav2Vec2 ONNX character-level model
-        Automatically creates ONNX model if it doesn't exist
+        Initialize Wav2Vec2 character-level model
 
         Args:
-            onnx_model_path: Path to the ONNX model file
-            processor_name: HuggingFace model name for the processor
+            model_name: HuggingFace model name
+            onnx: If True, use ONNX runtime for inference. If False, use Transformers
+            onnx_model_path: Path to the ONNX model file (only used if onnx=True)
         """
-        print(f"Loading Wav2Vec2 ONNX model from: {onnx_model_path}")
-        print(f"Loading processor: {processor_name}")
+        self.model_name = model_name
+        self.use_onnx = onnx
+        self.onnx_model_path = onnx_model_path
+        self.sample_rate = 16000
+
+        print(f"Loading Wav2Vec2 character model: {model_name}")
+        print(f"Using {'ONNX' if onnx else 'Transformers'} for inference")
+
+        if self.use_onnx:
+            self._init_onnx_model()
+        else:
+            self._init_transformers_model()
 
+    def _init_onnx_model(self):
+        """Initialize ONNX model and processor"""
         # Check if ONNX model exists, if not create it
-        if not os.path.exists(onnx_model_path):
-            print(f"ONNX model not found at {onnx_model_path}. Creating it...")
-            self._create_onnx_model(onnx_model_path, processor_name)
+        if not os.path.exists(self.onnx_model_path):
+            print(f"ONNX model not found at {self.onnx_model_path}. Creating it...")
+            self._create_onnx_model()
 
         try:
             # Load ONNX model
-            self.session = onnxruntime.InferenceSession(onnx_model_path)
+            self.session = onnxruntime.InferenceSession(self.onnx_model_path)
            self.input_name = self.session.get_inputs()[0].name
            self.output_name = self.session.get_outputs()[0].name
 
             # Load processor
-            self.processor = Wav2Vec2Processor.from_pretrained(processor_name)
+            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
 
             print("ONNX Wav2Vec2 character model loaded successfully")
-            self.model_name = processor_name
-            self.onnx_path = onnx_model_path
-            self.sample_rate = 16000
 
         except Exception as e:
             print(f"Error loading ONNX model: {e}")
             raise
 
-    def _create_onnx_model(self, onnx_model_path: str, processor_name: str):
+    def _init_transformers_model(self):
+        """Initialize Transformers model and processor"""
+        try:
+            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
+            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
+            self.model.eval()
+            print("Wav2Vec2 character model loaded successfully")
+        except Exception as e:
+            print(f"Error loading model {self.model_name}: {e}")
+            # Fallback to base model
+            fallback_model = "facebook/wav2vec2-base-960h"
+            print(f"Trying fallback model: {fallback_model}")
+            try:
+                self.processor = Wav2Vec2Processor.from_pretrained(fallback_model)
+                self.model = Wav2Vec2ForCTC.from_pretrained(fallback_model)
+                self.model.eval()
+                self.model_name = fallback_model
+                print("Fallback model loaded successfully")
+            except Exception as e2:
+                raise Exception(
+                    f"Failed to load both models. Original error: {e}, Fallback error: {e2}"
+                )
+
+    def _create_onnx_model(self):
         """Create ONNX model if it doesn't exist"""
         try:
             # Import the converter from model_convert
             from src.model_convert.wav2vec2onnx import Wav2Vec2ONNXConverter
 
             print("Creating new ONNX model...")
-            converter = Wav2Vec2ONNXConverter(processor_name)
+            converter = Wav2Vec2ONNXConverter(self.model_name)
             created_path = converter.convert_to_onnx(
-                onnx_path=onnx_model_path,
+                onnx_path=self.onnx_model_path,
                 input_length=160000,  # 10 seconds
                 opset_version=14,
             )
@@ -184,9 +217,16 @@ class Wav2Vec2CharacterASRONNX:
 
     def transcribe_to_characters(self, audio_path: str) -> Dict:
         """
-        Transcribe audio directly to characters using ONNX model (no language model correction)
+        Transcribe audio directly to characters (no language model correction)
         Returns raw character sequence as produced by the model
         """
+        if self.use_onnx:
+            return self._transcribe_onnx(audio_path)
+        else:
+            return self._transcribe_transformers(audio_path)
+
+    def _transcribe_onnx(self, audio_path: str) -> Dict:
+        """Transcribe using ONNX runtime"""
         try:
             # Load audio
             start_time = time.time()
@@ -233,14 +273,56 @@ class Wav2Vec2CharacterASRONNX:
             }
 
         except Exception as e:
-            print(f"Transcription error: {e}")
+            print(f"ONNX transcription error: {e}")
+            return self._empty_result()
+
+    def _transcribe_transformers(self, audio_path: str) -> Dict:
+        """Transcribe using Transformers"""
+        try:
+            # Load audio
+            start_time = time.time()
+            speech, sr = librosa.load(audio_path, sr=self.sample_rate)
+
+            # Prepare input
+            input_values = self.processor(
+                speech, sampling_rate=self.sample_rate, return_tensors="pt"
+            ).input_values
+
+            # Get model predictions (no language model involved)
+            with torch.no_grad():
+                logits = self.model(input_values).logits
+                predicted_ids = torch.argmax(logits, dim=-1)
+
+            # Decode to characters directly
+            character_transcript = self.processor.batch_decode(predicted_ids)[0]
+
+            # Clean up character transcript
+            character_transcript = self._clean_character_transcript(
+                character_transcript
+            )
+
+            # Convert characters to phoneme-like representation
+            phoneme_like_transcript = self._characters_to_phoneme_representation(
+                character_transcript
+            )
+
+            logger.info(
+                f"Transformers transcription time: {time.time() - start_time:.2f}s"
+            )
+
             return {
-                "character_transcript": "",
-                "phoneme_representation": "",
-                "raw_predicted_ids": [],
-                "confidence_scores": [],
+                "character_transcript": character_transcript,
+                "phoneme_representation": phoneme_like_transcript,
+                "raw_predicted_ids": predicted_ids[0].tolist(),
+                "confidence_scores": torch.softmax(logits, dim=-1)
+                .max(dim=-1)[0][0]
+                .tolist()[:100],  # Limit for JSON
             }
 
+        except Exception as e:
+            print(f"Transformers transcription error: {e}")
+            return self._empty_result()
+
     def _calculate_confidence_scores(self, logits: np.ndarray) -> List[float]:
         """Calculate confidence scores from logits using numpy"""
         # Apply softmax
@@ -257,27 +339,23 @@ class Wav2Vec2CharacterASRONNX:
         logger.info(f"Raw transcript before cleaning: {transcript}")
         cleaned = re.sub(r"\s+", " ", transcript)
         cleaned = cleaned.strip().lower()
-
         return cleaned
 
     def _characters_to_phoneme_representation(self, text: str) -> str:
         """Convert character-based transcript to phoneme-like representation for comparison"""
-        # This is a simple character-to-phoneme mapping for pronunciation comparison
-        # The idea is to convert the raw character output to something comparable with reference phonemes
-
         if not text:
             return ""
 
         words = text.split()
         phoneme_words = []
-
-        # Use our G2P to convert transcript words to phonemes
         g2p = SimpleG2P()
-
         for word in words:
             try:
-                word_data = g2p.text_to_phonemes(word)[0]
-                phoneme_words.extend(word_data["phonemes"])
+                if g2p:
+                    word_data = g2p.text_to_phonemes(word)[0]
+                    phoneme_words.extend(word_data["phonemes"])
+                else:
+                    phoneme_words.extend(self._simple_letter_to_phoneme(word))
             except:
                 # Fallback: simple letter-to-sound mapping
                 phoneme_words.extend(self._simple_letter_to_phoneme(word))
@@ -322,17 +400,35 @@ class Wav2Vec2CharacterASRONNX:
 
         return phonemes
 
-    def get_model_info(self) -> Dict:
-        """Get information about the loaded ONNX model"""
+    def _empty_result(self) -> Dict:
+        """Return empty result structure"""
         return {
-            "onnx_model_path": self.onnx_path,
-            "processor_name": self.model_name,
-            "input_name": self.input_name,
-            "output_name": self.output_name,
+            "character_transcript": "",
+            "phoneme_representation": "",
+            "raw_predicted_ids": [],
+            "confidence_scores": [],
+        }
+
+    def get_model_info(self) -> Dict:
+        """Get information about the loaded model"""
+        info = {
+            "model_name": self.model_name,
             "sample_rate": self.sample_rate,
-            "session_providers": self.session.get_providers(),
+            "inference_method": "ONNX" if self.use_onnx else "Transformers",
        }
 
+        if self.use_onnx:
+            info.update(
+                {
+                    "onnx_model_path": self.onnx_model_path,
+                    "input_name": self.input_name,
+                    "output_name": self.output_name,
+                    "session_providers": self.session.get_providers(),
+                }
+            )
+
+        return info
+
 
 class SimpleG2P:
     """Simple Grapheme-to-Phoneme converter for reference text"""
@@ -866,7 +962,7 @@ class SimplePronunciationAssessor:
 
     def __init__(self):
         print("Initializing Simple Pronunciation Assessor...")
-        self.wav2vec2_asr = Wav2Vec2CharacterASRONNX()  # Advanced mode
+        self.wav2vec2_asr = Wav2Vec2CharacterASR()  # Advanced mode
         self.whisper_asr = WhisperASR()  # Normal mode
         self.word_analyzer = WordAnalyzer()
         self.feedback_generator = SimpleFeedbackGenerator()
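
For context, a minimal usage sketch of the refactored class; "sample.wav" is a placeholder path, not part of the commit:

    # Transformers backend by default; pass onnx=True for the ONNX runtime path
    asr = Wav2Vec2CharacterASR()
    result = asr.transcribe_to_characters("sample.wav")  # placeholder audio file
    print(result["character_transcript"])    # raw CTC character output
    print(result["phoneme_representation"])  # phoneme-like string for comparison
    print(asr.get_model_info())              # reports model_name, inference_method, ...
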
src/utils/speaking_utils.py ADDED
@@ -0,0 +1,556 @@
+from typing import List, Dict
+import numpy as np
+import nltk
+import eng_to_ipa as ipa
+import re
+from collections import defaultdict
+
+
+try:
+    nltk.download("cmudict", quiet=True)
+    from nltk.corpus import cmudict
+except:
+    print("Warning: NLTK data not available")
+
+
+class SimpleG2P:
+    """Simple Grapheme-to-Phoneme converter for reference text"""
+
+    def __init__(self):
+        try:
+            self.cmu_dict = cmudict.dict()
+        except:
+            self.cmu_dict = {}
+            print("Warning: CMU dictionary not available")
+
+    def text_to_phonemes(self, text: str) -> List[Dict]:
+        """Convert text to phoneme sequence"""
+        words = self._clean_text(text).split()
+        phoneme_sequence = []
+
+        for word in words:
+            word_phonemes = self._get_word_phonemes(word)
+            phoneme_sequence.append(
+                {
+                    "word": word,
+                    "phonemes": word_phonemes,
+                    "ipa": self._get_ipa(word),
+                    "phoneme_string": " ".join(word_phonemes),
+                }
+            )
+
+        return phoneme_sequence
+
+    def get_reference_phoneme_string(self, text: str) -> str:
+        """Get reference phoneme string for comparison"""
+        phoneme_sequence = self.text_to_phonemes(text)
+        all_phonemes = []
+
+        for word_data in phoneme_sequence:
+            all_phonemes.extend(word_data["phonemes"])
+
+        return " ".join(all_phonemes)
+
+    def _clean_text(self, text: str) -> str:
+        """Clean text for processing"""
+        text = re.sub(r"[^\w\s\']", " ", text)
+        text = re.sub(r"\s+", " ", text)
+        return text.lower().strip()
+
+    def _get_word_phonemes(self, word: str) -> List[str]:
+        """Get phonemes for a word"""
+        word_lower = word.lower()
+
+        if word_lower in self.cmu_dict:
+            # Remove stress markers and convert to Wav2Vec2 phoneme format
+            phonemes = self.cmu_dict[word_lower][0]
+            clean_phonemes = [re.sub(r"[0-9]", "", p) for p in phonemes]
+            return self._convert_to_wav2vec_format(clean_phonemes)
+        else:
+            return self._estimate_phonemes(word)
+
+    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
+        """Convert CMU phonemes to Wav2Vec2 format"""
+        # Mapping from CMU to Wav2Vec2/eSpeak phonemes
+        cmu_to_espeak = {
+            "AA": "ɑ",
+            "AE": "æ",
+            "AH": "ʌ",
+            "AO": "ɔ",
+            "AW": "aʊ",
+            "AY": "aɪ",
+            "EH": "ɛ",
+            "ER": "ɝ",
+            "EY": "eɪ",
+            "IH": "ɪ",
+            "IY": "i",
+            "OW": "oʊ",
+            "OY": "ɔɪ",
+            "UH": "ʊ",
+            "UW": "u",
+            "B": "b",
+            "CH": "tʃ",
+            "D": "d",
+            "DH": "ð",
+            "F": "f",
+            "G": "ɡ",
+            "HH": "h",
+            "JH": "dʒ",
+            "K": "k",
+            "L": "l",
+            "M": "m",
+            "N": "n",
+            "NG": "ŋ",
+            "P": "p",
+            "R": "r",
+            "S": "s",
+            "SH": "ʃ",
+            "T": "t",
+            "TH": "θ",
+            "V": "v",
+            "W": "w",
+            "Y": "j",
+            "Z": "z",
+            "ZH": "ʒ",
+        }
+
+        converted = []
+        for phoneme in cmu_phonemes:
+            converted_phoneme = cmu_to_espeak.get(phoneme, phoneme.lower())
+            converted.append(converted_phoneme)
+
+        return converted
+
+    def _get_ipa(self, word: str) -> str:
+        """Get IPA transcription"""
+        try:
+            return ipa.convert(word)
+        except:
+            return f"/{word}/"
+
+    def _estimate_phonemes(self, word: str) -> List[str]:
+        """Estimate phonemes for unknown words"""
+        # Basic phoneme estimation with eSpeak-style output
+        phoneme_map = {
+            "ch": ["tʃ"],
+            "sh": ["ʃ"],
+            "th": ["θ"],
+            "ph": ["f"],
+            "ck": ["k"],
+            "ng": ["ŋ"],
+            "qu": ["k", "w"],
+            "a": ["æ"],
+            "e": ["ɛ"],
+            "i": ["ɪ"],
+            "o": ["ʌ"],
+            "u": ["ʌ"],
+            "b": ["b"],
+            "c": ["k"],
+            "d": ["d"],
+            "f": ["f"],
+            "g": ["ɡ"],
+            "h": ["h"],
+            "j": ["dʒ"],
+            "k": ["k"],
+            "l": ["l"],
+            "m": ["m"],
+            "n": ["n"],
+            "p": ["p"],
+            "r": ["r"],
+            "s": ["s"],
+            "t": ["t"],
+            "v": ["v"],
+            "w": ["w"],
+            "x": ["k", "s"],
+            "y": ["j"],
+            "z": ["z"],
+        }
+
+        word = word.lower()
+        phonemes = []
+        i = 0
+
+        while i < len(word):
+            # Check 2-letter combinations first
+            if i <= len(word) - 2:
+                two_char = word[i : i + 2]
+                if two_char in phoneme_map:
+                    phonemes.extend(phoneme_map[two_char])
+                    i += 2
+                    continue
+
+            # Single character
+            char = word[i]
+            if char in phoneme_map:
+                phonemes.extend(phoneme_map[char])
+
+            i += 1
+
+        return phonemes
+
+
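
A quick usage sketch for SimpleG2P (output shown is approximate and assumes nltk's cmudict and eng_to_ipa are available):

    g2p = SimpleG2P()
    print(g2p.get_reference_phoneme_string("think this"))  # e.g. "θ ɪ ŋ k ð ɪ s"
    print(g2p.text_to_phonemes("think")[0]["phonemes"])    # e.g. ['θ', 'ɪ', 'ŋ', 'k']
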
+class PhonemeComparator:
+    """Compare reference and learner phoneme sequences"""
+
+    def __init__(self):
+        # Vietnamese speakers' common phoneme substitutions
+        self.substitution_patterns = {
+            "θ": ["f", "s", "t"],  # TH → F, S, T
+            "ð": ["d", "z", "v"],  # DH → D, Z, V
+            "v": ["w", "f"],  # V → W, F
+            "r": ["l"],  # R → L
+            "l": ["r"],  # L → R
+            "z": ["s"],  # Z → S
+            "ʒ": ["ʃ", "z"],  # ZH → SH, Z
+            "ŋ": ["n"],  # NG → N
+        }
+
+        # Difficulty levels for Vietnamese speakers
+        self.difficulty_map = {
+            "θ": 0.9,  # th (think)
+            "ð": 0.9,  # th (this)
+            "v": 0.8,  # v
+            "z": 0.8,  # z
+            "ʒ": 0.9,  # zh (measure)
+            "r": 0.7,  # r
+            "l": 0.6,  # l
+            "w": 0.5,  # w
+            "f": 0.4,  # f
+            "s": 0.3,  # s
+            "ʃ": 0.5,  # sh
+            "tʃ": 0.4,  # ch
+            "dʒ": 0.5,  # j
+            "ŋ": 0.3,  # ng
+        }
+
+    def compare_phoneme_sequences(
+        self, reference_phonemes: str, learner_phonemes: str
+    ) -> List[Dict]:
+        """Compare reference and learner phoneme sequences"""
+
+        # Split phoneme strings
+        ref_phones = reference_phonemes.split()
+        learner_phones = learner_phonemes.split()
+
+        print(f"Reference phonemes: {ref_phones}")
+        print(f"Learner phonemes: {learner_phones}")
+
+        # Simple alignment comparison
+        comparisons = []
+        max_len = max(len(ref_phones), len(learner_phones))
+
+        for i in range(max_len):
+            ref_phoneme = ref_phones[i] if i < len(ref_phones) else ""
+            learner_phoneme = learner_phones[i] if i < len(learner_phones) else ""
+
+            if ref_phoneme and learner_phoneme:
+                # Both present - check accuracy
+                if ref_phoneme == learner_phoneme:
+                    status = "correct"
+                    score = 1.0
+                elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
+                    status = "acceptable"
+                    score = 0.7
+                else:
+                    status = "wrong"
+                    score = 0.2
+
+            elif ref_phoneme and not learner_phoneme:
+                # Missing phoneme
+                status = "missing"
+                score = 0.0
+
+            elif learner_phoneme and not ref_phoneme:
+                # Extra phoneme
+                status = "extra"
+                score = 0.0
+            else:
+                continue
+
+            comparison = {
+                "position": i,
+                "reference_phoneme": ref_phoneme,
+                "learner_phoneme": learner_phoneme,
+                "status": status,
+                "score": score,
+                "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
+            }
+
+            comparisons.append(comparison)
+
+        return comparisons
+
+    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
+        """Check if learner phoneme is acceptable substitution for Vietnamese speakers"""
+        acceptable = self.substitution_patterns.get(reference, [])
+        return learner in acceptable
+
+
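
A usage sketch for PhonemeComparator with a typical Vietnamese θ→t substitution; scores follow the tables above:

    comparator = PhonemeComparator()
    diffs = comparator.compare_phoneme_sequences("θ ɪ ŋ k", "t ɪ n k")
    for d in diffs:
        print(d["reference_phoneme"], d["learner_phoneme"], d["status"], d["score"])
    # θ t acceptable 0.7   (t is listed as an acceptable substitution for θ)
    # ɪ ɪ correct 1.0
    # ŋ n acceptable 0.7
    # k k correct 1.0
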
+# =============================================================================
+# WORD ANALYZER
+# =============================================================================
+
+
+class WordAnalyzer:
+    """Analyze word-level pronunciation accuracy using character-based ASR"""
+
+    def __init__(self):
+        self.g2p = SimpleG2P()
+        self.comparator = PhonemeComparator()
+
+    def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
+        """Analyze word-level pronunciation using phoneme representation from character ASR"""
+
+        # Get reference phonemes by word
+        reference_words = self.g2p.text_to_phonemes(reference_text)
+
+        # Get overall phoneme comparison
+        reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
+        phoneme_comparisons = self.comparator.compare_phoneme_sequences(
+            reference_phoneme_string, learner_phonemes
+        )
+
+        # Map phonemes back to words
+        word_highlights = self._create_word_highlights(
+            reference_words, phoneme_comparisons
+        )
+
+        # Identify wrong words
+        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
+
+        return {
+            "word_highlights": word_highlights,
+            "phoneme_differences": phoneme_comparisons,
+            "wrong_words": wrong_words,
+        }
+
+    def _create_word_highlights(
+        self, reference_words: List[Dict], phoneme_comparisons: List[Dict]
+    ) -> List[Dict]:
+        """Create word highlighting data"""
+
+        word_highlights = []
+        phoneme_index = 0
+
+        for word_data in reference_words:
+            word = word_data["word"]
+            word_phonemes = word_data["phonemes"]
+            num_phonemes = len(word_phonemes)
+
+            # Get phoneme scores for this word
+            word_phoneme_scores = []
+            for j in range(num_phonemes):
+                if phoneme_index + j < len(phoneme_comparisons):
+                    comparison = phoneme_comparisons[phoneme_index + j]
+                    word_phoneme_scores.append(comparison["score"])
+
+            # Calculate word score
+            word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0
+
+            # Create word highlight
+            highlight = {
+                "word": word,
+                "score": float(word_score),
+                "status": self._get_word_status(word_score),
+                "color": self._get_word_color(word_score),
+                "phonemes": word_phonemes,
+                "ipa": word_data["ipa"],
+                "phoneme_scores": word_phoneme_scores,
+                "phoneme_start_index": phoneme_index,
+                "phoneme_end_index": phoneme_index + num_phonemes - 1,
+            }
+
+            word_highlights.append(highlight)
+            phoneme_index += num_phonemes
+
+        return word_highlights
+
+    def _identify_wrong_words(
+        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
+    ) -> List[Dict]:
+        """Identify words that were pronounced incorrectly"""
+
+        wrong_words = []
+
+        for word_highlight in word_highlights:
+            if word_highlight["score"] < 0.6:  # Threshold for wrong pronunciation
+
+                # Find specific phoneme errors for this word
+                start_idx = word_highlight["phoneme_start_index"]
+                end_idx = word_highlight["phoneme_end_index"]
+
+                wrong_phonemes = []
+                missing_phonemes = []
+
+                for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
+                    comparison = phoneme_comparisons[i]
+
+                    if comparison["status"] == "wrong":
+                        wrong_phonemes.append(
+                            {
+                                "expected": comparison["reference_phoneme"],
+                                "actual": comparison["learner_phoneme"],
+                                "difficulty": comparison["difficulty"],
+                            }
+                        )
+                    elif comparison["status"] == "missing":
+                        missing_phonemes.append(
+                            {
+                                "phoneme": comparison["reference_phoneme"],
+                                "difficulty": comparison["difficulty"],
+                            }
+                        )
+
+                wrong_word = {
+                    "word": word_highlight["word"],
+                    "score": word_highlight["score"],
+                    "expected_phonemes": word_highlight["phonemes"],
+                    "ipa": word_highlight["ipa"],
+                    "wrong_phonemes": wrong_phonemes,
+                    "missing_phonemes": missing_phonemes,
+                    "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes),
+                }
+
+                wrong_words.append(wrong_word)
+
+        return wrong_words
+
+    def _get_word_status(self, score: float) -> str:
+        """Get word status from score"""
+        if score >= 0.8:
+            return "excellent"
+        elif score >= 0.6:
+            return "good"
+        elif score >= 0.4:
+            return "needs_practice"
+        else:
+            return "poor"
+
+    def _get_word_color(self, score: float) -> str:
+        """Get color for word highlighting"""
+        if score >= 0.8:
+            return "#22c55e"  # Green
+        elif score >= 0.6:
+            return "#84cc16"  # Light green
+        elif score >= 0.4:
+            return "#eab308"  # Yellow
+        else:
+            return "#ef4444"  # Red
+
+    def _get_vietnamese_tips(
+        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
+    ) -> List[str]:
+        """Get Vietnamese-specific pronunciation tips"""
+
+        tips = []
+
+        # Tips for specific Vietnamese pronunciation challenges
+        vietnamese_tips = {
+            "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
+            "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
+            "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
+            "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
+            "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
+            "z": "Giống âm 's' nhưng có rung dây thanh âm",
+            "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
+            "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
+        }
+
+        # Add tips for wrong phonemes
+        for wrong in wrong_phonemes:
+            expected = wrong["expected"]
+            actual = wrong["actual"]
+
+            if expected in vietnamese_tips:
+                tips.append(f"Âm '{expected}': {vietnamese_tips[expected]}")
+            else:
+                tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")
+
+        # Add tips for missing phonemes
+        for missing in missing_phonemes:
+            phoneme = missing["phoneme"]
+            if phoneme in vietnamese_tips:
+                tips.append(f"Thiếu âm '{phoneme}': {vietnamese_tips[phoneme]}")
+
+        return tips
+
+
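
A sketch of WordAnalyzer on the same example; with every phoneme correct or acceptable the word averages 0.85, so it is highlighted green and not flagged:

    analyzer = WordAnalyzer()
    analysis = analyzer.analyze_words("think", "t ɪ n k")
    h = analysis["word_highlights"][0]
    print(h["score"], h["status"], h["color"])  # 0.85 excellent #22c55e
    print(analysis["wrong_words"])              # [] (score is above the 0.6 threshold)
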
+class SimpleFeedbackGenerator:
+    """Generate simple, actionable feedback in Vietnamese"""
+
+    def generate_feedback(
+        self,
+        overall_score: float,
+        wrong_words: List[Dict],
+        phoneme_comparisons: List[Dict],
+    ) -> List[str]:
+        """Generate Vietnamese feedback"""
+
+        feedback = []
+
+        # Overall feedback in Vietnamese
+        if overall_score >= 0.8:
+            feedback.append("Phát âm rất tốt! Bạn đã làm xuất sắc.")
+        elif overall_score >= 0.6:
+            feedback.append("Phát âm khá tốt, còn một vài điểm cần cải thiện.")
+        elif overall_score >= 0.4:
+            feedback.append(
+                "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
+            )
+        else:
+            feedback.append("Hãy luyện tập chậm và rõ ràng hơn.")
+
+        # Wrong words feedback
+        if wrong_words:
+            if len(wrong_words) <= 3:
+                word_names = [w["word"] for w in wrong_words]
+                feedback.append(f"Các từ cần luyện tập: {', '.join(word_names)}")
+            else:
+                feedback.append(
+                    f"Có {len(wrong_words)} từ cần luyện tập. Tập trung vào từng từ một."
+                )
+
+        # Most problematic phonemes
+        problem_phonemes = defaultdict(int)
+        for comparison in phoneme_comparisons:
+            if comparison["status"] in ["wrong", "missing"]:
+                phoneme = comparison["reference_phoneme"]
+                problem_phonemes[phoneme] += 1
+
+        if problem_phonemes:
+            most_difficult = sorted(
+                problem_phonemes.items(), key=lambda x: x[1], reverse=True
+            )
+            top_problem = most_difficult[0][0]
+
+            phoneme_tips = {
+                "θ": "Lưỡi giữa răng, thổi nhẹ",
+                "ð": "Lưỡi giữa răng, rung dây thanh",
+                "v": "Môi dưới chạm răng trên",
+                "r": "Cuộn lưỡi, không chạm vòm miệng",
+                "l": "Lưỡi chạm vòm miệng",
+                "z": "Như 's' nhưng rung dây thanh",
+            }
+
+            if top_problem in phoneme_tips:
+                feedback.append(
+                    f"Âm khó nhất '{top_problem}': {phoneme_tips[top_problem]}"
+                )
+
+        return feedback
+
+
+def convert_numpy_types(obj):
+    """Convert numpy types to Python native types"""
+    if isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_numpy_types(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    else:
+        return obj
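
An end-to-end sketch tying the pieces together; averaging word scores into overall_score is an illustrative assumption here, since the controller's actual aggregation is not shown in this commit:

    analysis = WordAnalyzer().analyze_words("think this", "t ɪ n k d ɪ s")
    overall = float(np.mean([h["score"] for h in analysis["word_highlights"]]))
    feedback = SimpleFeedbackGenerator().generate_feedback(
        overall, analysis["wrong_words"], analysis["phoneme_differences"]
    )
    payload = convert_numpy_types({"overall_score": overall, **analysis})  # JSON-safe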