Spaces:

ABAO77
/

Run_code_api

Sleeping

App Files Files Community

ABAO77 committed on Sep 1

Commit

c6480d4

1 Parent(s): dd47219

Implement code changes to enhance functionality and improve performance

Browse files

Files changed (2) hide show

src/apis/__pycache__/create_app.cpython-311.pyc +0 -0
src/apis/routes/speaking_route.py +751 -1481

src/apis/__pycache__/create_app.cpython-311.pyc CHANGED Viewed

Binary files a/src/apis/__pycache__/create_app.cpython-311.pyc and b/src/apis/__pycache__/create_app.cpython-311.pyc differ

src/apis/routes/speaking_route.py CHANGED Viewed

@@ -1,29 +1,28 @@
-# ENHANCED PRONUNCIATION API - MULTI-WORD SUPPORT
-# Supports any English word using CMU Dict + phoneme libraries
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from typing import List, Dict, Optional, Tuple
 import tempfile
 import os
 import numpy as np
 import librosa
 import nltk
 import eng_to_ipa as ipa
-import pronouncing
-import requests
-import json
 import re
 from collections import defaultdict
 import warnings
 warnings.filterwarnings("ignore")
 # Download required NLTK data
 try:
     nltk.download("cmudict", quiet=True)
-    nltk.download("punkt", quiet=True)
     from nltk.corpus import cmudict
 except:
     print("Warning: NLTK data not available")
@@ -31,1119 +30,495 @@ except:
 # =============================================================================
 # MODELS
 # =============================================================================
-router = APIRouter(prefix="/speaking", tags=["AI"])
-class PronunciationResult(BaseModel):
     overall_score: float
-    status: str
     feedback: List[str]
-    words: List[Dict]
-    phoneme_details: List[Dict]
-    audio_info: Dict
-    processing_time: float
-    difficulty_analysis: Dict
-class WordPhonemeInfo(BaseModel):
-    word: str
-    phonemes: List[str]
-    ipa_transcription: str
-    syllables: List[str]
-    stress_pattern: List[int]
 # =============================================================================
-# ENHANCED PHONEME PROCESSOR
 # =============================================================================
-class EnhancedPhonemeProcessor:
-    """Advanced phoneme processing with multiple dictionaries"""
     def __init__(self):
-        self.sample_rate = 16000
-        # Load CMU dictionary
         try:
             self.cmu_dict = cmudict.dict()
         except:
             self.cmu_dict = {}
             print("Warning: CMU dictionary not available")
-        # Load comprehensive phoneme acoustic models
-        self.phoneme_models = self._load_comprehensive_phoneme_models()
-        # Phoneme difficulty for Vietnamese speakers
-        self.difficulty_map = {
-            # Very difficult for Vietnamese
-            "TH": 0.9,  # think, that
-            "DH": 0.9,  # this, then
-            "V": 0.8,  # very, love
-            "Z": 0.8,  # zoo, rise
-            "ZH": 0.9,  # measure, vision
-            "R": 0.7,  # red, car
-            "L": 0.6,  # love, well
-            "W": 0.5,  # water, well
-            # Moderately difficult
-            "F": 0.4,  # fish, life
-            "S": 0.3,  # see, this
-            "SH": 0.5,  # shoe, fish
-            "CH": 0.4,  # chair, much
-            "JH": 0.5,  # job, bridge
-            # Vowels - challenging distinctions
-            "IY": 0.3,  # beat
-            "IH": 0.6,  # bit
-            "EY": 0.4,  # bait
-            "EH": 0.5,  # bet
-            "AE": 0.7,  # bat
-            "AH": 0.4,  # but
-            "AO": 0.6,  # bought
-            "OW": 0.4,  # boat
-            "UH": 0.6,  # book
-            "UW": 0.4,  # boot
-            # Easier sounds
-            "P": 0.2,
-            "B": 0.2,
-            "T": 0.2,
-            "D": 0.2,
-            "K": 0.2,
-            "G": 0.2,
-            "M": 0.2,
-            "N": 0.2,
-            "NG": 0.3,
-        }
-    def get_word_phonemes(self, word: str) -> WordPhonemeInfo:
-        """Get comprehensive phoneme info for any English word"""
-        word_lower = word.lower().strip()
-        # Method 1: CMU Dictionary (most reliable)
-        cmu_phonemes = []
         if word_lower in self.cmu_dict:
-            # Get first pronunciation variant
-            cmu_phonemes = self.cmu_dict[word_lower][0]
-            # Remove stress markers (0,1,2) from vowels
-            cmu_phonemes = [re.sub(r"[0-9]", "", p) for p in cmu_phonemes]
-        # Method 2: eng_to_ipa library
-        ipa_transcription = ""
-        try:
-            ipa_transcription = ipa.convert(word)
-        except:
-            ipa_transcription = f"/{word}/"
-        # Method 3: pronouncing library for syllables
-        syllables = []
         try:
-            syllable_count = pronouncing.syllable_count(word)
-            # Simple syllable division
-            if syllable_count and len(word) > syllable_count:
-                syllable_length = len(word) // syllable_count
-                syllables = [
-                    word[i : i + syllable_length]
-                    for i in range(0, len(word), syllable_length)
-                ]
-            else:
-                syllables = [word]
         except:
-            syllables = [word]
-        # Extract stress pattern from CMU
-        stress_pattern = []
-        if word_lower in self.cmu_dict:
-            for phoneme in self.cmu_dict[word_lower][0]:
-                stress = re.findall(r"[0-9]", phoneme)
-                if stress:
-                    stress_pattern.append(int(stress[0]))
-        # Fallback phonemes if CMU not available
-        if not cmu_phonemes:
-            cmu_phonemes = self._estimate_phonemes(word)
-        return WordPhonemeInfo(
-            word=word,
-            phonemes=cmu_phonemes,
-            ipa_transcription=ipa_transcription,
-            syllables=syllables,
-            stress_pattern=stress_pattern,
-        )
     def _estimate_phonemes(self, word: str) -> List[str]:
         """Estimate phonemes for unknown words"""
-        # Simple grapheme-to-phoneme mapping
         phoneme_map = {
-            "ch": ["CH"],
-            "sh": ["SH"],
-            "th": ["TH"],
-            "ph": ["F"],
-            "ck": ["K"],
-            "ng": ["NG"],
-            "qu": ["K", "W"],
-            "a": ["AE"],
-            "e": ["EH"],
-            "i": ["IH"],
-            "o": ["AH"],
-            "u": ["AH"],
-            "b": ["B"],
-            "c": ["K"],
-            "d": ["D"],
-            "f": ["F"],
-            "g": ["G"],
-            "h": ["HH"],
-            "j": ["JH"],
-            "k": ["K"],
-            "l": ["L"],
-            "m": ["M"],
-            "n": ["N"],
-            "p": ["P"],
-            "r": ["R"],
-            "s": ["S"],
-            "t": ["T"],
-            "v": ["V"],
-            "w": ["W"],
-            "x": ["K", "S"],
-            "y": ["Y"],
-            "z": ["Z"],
         }
         word = word.lower()
         phonemes = []
         i = 0
         while i < len(word):
             # Check 2-letter combinations first
-            if i < len(word) - 1:
-                two_char = word[i : i + 2]
                 if two_char in phoneme_map:
                     phonemes.extend(phoneme_map[two_char])
                     i += 2
                     continue
             # Single character
             char = word[i]
             if char in phoneme_map:
                 phonemes.extend(phoneme_map[char])
             i += 1
         return phonemes
-    def _load_comprehensive_phoneme_models(self) -> Dict:
-        """Load comprehensive phoneme acoustic models"""
-        # Extended phoneme set với acoustic characteristics
-        models = {}
-        # VOWELS
-        vowel_models = {
-            "IY": {"f1": 270, "f2": 2300, "duration": 150, "type": "vowel"},  # beat
-            "IH": {"f1": 390, "f2": 1990, "duration": 120, "type": "vowel"},  # bit
-            "EY": {"f1": 400, "f2": 2100, "duration": 160, "type": "vowel"},  # bait
-            "EH": {"f1": 550, "f2": 1770, "duration": 130, "type": "vowel"},  # bet
-            "AE": {"f1": 690, "f2": 1660, "duration": 140, "type": "vowel"},  # bat
-            "AH": {"f1": 640, "f2": 1190, "duration": 110, "type": "vowel"},  # but
-            "AO": {"f1": 570, "f2": 840, "duration": 150, "type": "vowel"},  # bought
-            "OW": {"f1": 430, "f2": 1020, "duration": 160, "type": "vowel"},  # boat
-            "UH": {"f1": 450, "f2": 1030, "duration": 120, "type": "vowel"},  # book
-            "UW": {"f1": 310, "f2": 870, "duration": 150, "type": "vowel"},  # boot
-            "ER": {"f1": 490, "f2": 1350, "duration": 140, "type": "vowel"},  # bird
-            "AY": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"},  # bite
-            "AW": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"},  # bout
-            "OY": {"f1": 570, "f2": 840, "duration": 180, "type": "vowel"},  # boy
-        }
-        # CONSONANTS
-        consonant_models = {
-            # Stops
-            "P": {
-                "burst_energy": 0.8,
-                "duration": 80,
-                "type": "stop",
-                "voicing": False,
-            },
-            "B": {"burst_energy": 0.7, "duration": 85, "type": "stop", "voicing": True},
-            "T": {
-                "burst_energy": 0.9,
-                "duration": 75,
-                "type": "stop",
-                "voicing": False,
-            },
-            "D": {
-                "burst_energy": 0.75,
-                "duration": 80,
-                "type": "stop",
-                "voicing": True,
-            },
-            "K": {
-                "burst_energy": 0.85,
-                "duration": 70,
-                "type": "stop",
-                "voicing": False,
-            },
-            "G": {"burst_energy": 0.7, "duration": 75, "type": "stop", "voicing": True},
-            # Fricatives (challenging for Vietnamese)
-            "F": {
-                "high_freq": True,
-                "duration": 120,
-                "type": "fricative",
-                "voicing": False,
-            },
-            "V": {
-                "high_freq": True,
-                "duration": 110,
-                "type": "fricative",
-                "voicing": True,
-            },
-            "TH": {
-                "high_freq": True,
-                "duration": 130,
-                "type": "fricative",
-                "voicing": False,
-            },  # think
-            "DH": {
-                "high_freq": True,
-                "duration": 120,
-                "type": "fricative",
-                "voicing": True,
-            },  # this
-            "S": {
-                "very_high_freq": True,
-                "duration": 140,
-                "type": "fricative",
-                "voicing": False,
-            },
-            "Z": {
-                "very_high_freq": True,
-                "duration": 130,
-                "type": "fricative",
-                "voicing": True,
-            },
-            "SH": {
-                "high_freq": True,
-                "duration": 150,
-                "type": "fricative",
-                "voicing": False,
-            },  # shoe
-            "ZH": {
-                "high_freq": True,
-                "duration": 140,
-                "type": "fricative",
-                "voicing": True,
-            },  # measure
-            "HH": {
-                "breathy": True,
-                "duration": 100,
-                "type": "fricative",
-                "voicing": False,
-            },  # hello
-            # Affricates
-            "CH": {
-                "burst_fricative": True,
-                "duration": 160,
-                "type": "affricate",
-                "voicing": False,
-            },  # chair
-            "JH": {
-                "burst_fricative": True,
-                "duration": 150,
-                "type": "affricate",
-                "voicing": True,
-            },  # job
-            # Nasals
-            "M": {"nasal": True, "duration": 100, "type": "nasal", "voicing": True},
-            "N": {"nasal": True, "duration": 95, "type": "nasal", "voicing": True},
-            "NG": {
-                "nasal": True,
-                "duration": 105,
-                "type": "nasal",
-                "voicing": True,
-            },  # ring
-            # Liquids (challenging L/R distinction)
-            "L": {"lateral": True, "duration": 90, "type": "liquid", "voicing": True},
-            "R": {"retroflex": True, "duration": 95, "type": "liquid", "voicing": True},
-            # Glides
-            "Y": {"glide": True, "duration": 70, "type": "glide", "voicing": True},
-            "W": {"glide": True, "duration": 75, "type": "glide", "voicing": True},
-        }
-        # Combine models
-        models.update(vowel_models)
-        models.update(consonant_models)
-        return models
-    def get_difficulty_score(self, phonemes: List[str]) -> float:
-        """Calculate difficulty score for Vietnamese speakers"""
-        if not phonemes:
-            return 0.5
-        difficulties = []
-        for phoneme in phonemes:
-            # Remove stress markers
-            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-            difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
-            difficulties.append(difficulty)
-        return np.mean(difficulties)
-    def score_phoneme_advanced(
-        self, phoneme: str, segment_features: Dict, context: Dict = None
-    ) -> float:
-        """Advanced phoneme scoring với context"""
-        clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-        if clean_phoneme not in self.phoneme_models:
-            return 0.5  # Unknown phoneme
-        model = self.phoneme_models[clean_phoneme]
-        score = 0.0
-        # Type-specific scoring
-        if model["type"] == "vowel":
-            score = self._score_vowel(clean_phoneme, segment_features, model)
-        elif model["type"] == "fricative":
-            score = self._score_fricative(clean_phoneme, segment_features, model)
-        elif model["type"] == "stop":
-            score = self._score_stop(clean_phoneme, segment_features, model)
-        elif model["type"] in ["liquid", "nasal", "glide", "affricate"]:
-            score = self._score_other_consonant(clean_phoneme, segment_features, model)
-        # Context adjustments
-        if context:
-            score = self._apply_context_adjustments(score, clean_phoneme, context)
-        # Difficulty adjustment for Vietnamese speakers
-        difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
-        # Easier scoring for more difficult phonemes
-        adjusted_score = score + (difficulty * 0.1)
-        return np.clip(adjusted_score, 0, 1)
-    def _score_vowel(self, phoneme: str, features: Dict, model: Dict) -> float:
-        """Score vowel phoneme"""
-        score = 0.0
-        # Energy check (vowels should have good energy)
-        if features.get("rms_mean", 0) > 0.01:
-            score += 0.3
-        # Spectral characteristics
-        centroid = features.get("spectral_centroid_mean", 0)
-        target_f2 = model.get("f2", 1500)
-        # F2 approximation from spectral centroid
-        f2_error = abs(centroid - target_f2) / target_f2
-        f2_score = max(0, 1 - f2_error)
-        score += 0.4 * f2_score
-        # Stability (vowels should be stable)
-        zcr = features.get("zcr_mean", 0)
-        if zcr < 0.1:  # Low zero crossing for vowels
-            score += 0.3
-        return score
-    def _score_fricative(self, phoneme: str, features: Dict, model: Dict) -> float:
-        """Score fricative phoneme"""
-        score = 0.0
-        # High frequency content for fricatives
-        centroid = features.get("spectral_centroid_mean", 0)
-        zcr = features.get("zcr_mean", 0)
-        if model.get("very_high_freq"):  # S, Z sounds
-            if centroid > 3000:
-                score += 0.4
-            if zcr > 0.2:
-                score += 0.4
-        elif model.get("high_freq"):  # F, V, TH, DH, SH, ZH
-            if centroid > 1500:
-                score += 0.4
-            if zcr > 0.15:
-                score += 0.3
-        # Voicing check
-        energy = features.get("rms_mean", 0)
-        if model.get("voicing") and energy > 0.01:  # Voiced fricatives
-            score += 0.2
-        elif not model.get("voicing") and energy < 0.05:  # Voiceless fricatives
-            score += 0.2
-        return score
-    def _score_stop(self, phoneme: str, features: Dict, model: Dict) -> float:
-        """Score stop consonant"""
-        score = 0.0
-        # Burst energy
-        energy = features.get("rms_mean", 0)
-        burst_threshold = 0.02 if model.get("voicing") else 0.03
-        if energy > burst_threshold:
-            score += 0.6
-        # Duration check
-        # Stops should be relatively short
-        score += 0.4  # Base score for presence
-        return score
-    def _score_other_consonant(
-        self, phoneme: str, features: Dict, model: Dict
-    ) -> float:
-        """Score other consonant types"""
-        score = 0.0
-        energy = features.get("rms_mean", 0)
-        centroid = features.get("spectral_centroid_mean", 0)
-        zcr = features.get("zcr_mean", 0)
-        if model["type"] == "liquid":
-            # L/R sounds - moderate energy, specific spectral characteristics
-            if 0.01 <= energy <= 0.08:
-                score += 0.3
-            if phoneme == "R" and centroid < 1800:  # R lowers F3
-                score += 0.4
-            elif phoneme == "L" and 1200 <= centroid <= 2200:
-                score += 0.4
-            score += 0.3  # Base score
-        elif model["type"] == "nasal":
-            # Nasal sounds - good energy, specific spectral pattern
-            if energy > 0.005:
-                score += 0.4
-            if 800 <= centroid <= 2000:
-                score += 0.3
-            score += 0.3
-        elif model["type"] == "glide":
-            # W/Y sounds - transition characteristics
-            if energy > 0.005:
-                score += 0.5
-            score += 0.5
-        elif model["type"] == "affricate":
-            # CH/JH - combination of stop + fricative
-            if energy > 0.02:  # Burst component
-                score += 0.3
-            if zcr > 0.1:  # Fricative component
-                score += 0.4
-            score += 0.3
-        return score
-    def _apply_context_adjustments(
-        self, score: float, phoneme: str, context: Dict
-    ) -> float:
-        """Apply contextual adjustments"""
-        # Position in word adjustments
-        position = context.get("position", "middle")
-        if position == "initial" and phoneme in ["TH", "DH"]:
-            score *= 1.1  # Easier in initial position
-        elif position == "final" and phoneme in ["T", "D", "K", "G"]:
-            score *= 0.9  # Harder in final position (Vietnamese tendency to drop)
-        # Surrounding phonemes
-        prev_phoneme = context.get("prev_phoneme")
-        next_phoneme = context.get("next_phoneme")
-        # Consonant clusters (difficult for Vietnamese)
-        if (
-            prev_phoneme
-            and prev_phoneme in ["S", "T", "K"]
-            and phoneme in ["T", "K", "P"]
-        ):
-            score *= 0.8  # Consonant clusters are harder
-        return score
 # =============================================================================
-# ENHANCED PRONUNCIATION ASSESSOR
 # =============================================================================
-class EnhancedPronunciationAssessor:
-    """Enhanced assessor supporting any English word"""
     def __init__(self):
-        self.phoneme_processor = EnhancedPhonemeProcessor()
-        self.sample_rate = 16000
-    def process_audio_file(self, file_path: str, reference_text: str) -> Dict:
-        """Process audio file with enhanced phoneme analysis"""
-        # Load and validate audio
-        audio, sr = librosa.load(file_path, sr=self.sample_rate)
-        duration = len(audio) / sr
-        max_amplitude = np.max(np.abs(audio))
-        # Audio quality analysis
-        audio_info = self._analyze_audio_quality(audio, duration, max_amplitude)
-        # Extract comprehensive features
-        features = self._extract_comprehensive_features(audio)
-        # Text analysis
-        text_analysis = self._analyze_text(reference_text)
-        # Pronunciation assessment
-        pronunciation_analysis = self._assess_pronunciation(
-            audio, features, reference_text, text_analysis
-        )
-        return {
-            "audio_info": audio_info,
-            "text_analysis": text_analysis,
-            "pronunciation_analysis": pronunciation_analysis,
-            "features": features,
-        }
-    def _analyze_audio_quality(
-        self, audio: np.ndarray, duration: float, max_amplitude: float
-    ) -> Dict:
-        """Comprehensive audio quality analysis"""
-        issues = []
-        quality_score = 1.0
-        # Duration checks
-        if duration < 0.5:
-            issues.append("too_short")
-            quality_score *= 0.5
-        elif duration > 30:
-            issues.append("too_long")
-            quality_score *= 0.8
-        # Amplitude checks
-        if max_amplitude < 0.005:
-            issues.append("too_quiet")
-            quality_score *= 0.6
-        elif max_amplitude > 0.98:
-            issues.append("clipped")
-            quality_score *= 0.7
-        # Noise analysis
-        noise_floor = np.mean(np.abs(audio[: int(0.1 * len(audio))]))  # First 100ms
-        if noise_floor > 0.02:
-            issues.append("noisy")
-            quality_score *= 0.8
-        # Signal-to-noise ratio
-        signal_power = np.mean(audio**2)
-        snr = 10 * np.log10(signal_power / (noise_floor**2 + 1e-10))
-        return {
-            "duration": duration,
-            "max_amplitude": max_amplitude,
-            "noise_floor": noise_floor,
-            "snr": snr,
-            "quality_score": quality_score,
-            "issues": issues,
-            "quality_status": "good" if not issues else ",".join(issues),
         }
-    def _extract_comprehensive_features(self, audio: np.ndarray) -> Dict:
-        """Extract comprehensive acoustic features"""
-        features = {}
-        # Basic features
-        features["mfcc"] = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
-        features["mfcc_mean"] = np.mean(features["mfcc"], axis=1).tolist()
-        # Energy features
-        rms = librosa.feature.rms(y=audio, hop_length=512)[0]
-        features["rms"] = rms.tolist()
-        features["rms_mean"] = float(np.mean(rms))
-        features["rms_std"] = float(np.std(rms))
-        # Spectral features
-        spectral_centroid = librosa.feature.spectral_centroid(
-            y=audio, sr=self.sample_rate
-        )[0]
-        features["spectral_centroid"] = spectral_centroid.tolist()
-        features["spectral_centroid_mean"] = float(np.mean(spectral_centroid))
-        features["spectral_centroid_std"] = float(np.std(spectral_centroid))
-        # Additional spectral features
-        spectral_bandwidth = librosa.feature.spectral_bandwidth(
-            y=audio, sr=self.sample_rate
-        )[0]
-        features["spectral_bandwidth_mean"] = float(np.mean(spectral_bandwidth))
-        spectral_rolloff = librosa.feature.spectral_rolloff(
-            y=audio, sr=self.sample_rate
-        )[0]
-        features["spectral_rolloff_mean"] = float(np.mean(spectral_rolloff))
-        # Zero crossing rate
-        zcr = librosa.feature.zero_crossing_rate(audio, hop_length=512)[0]
-        features["zcr"] = zcr.tolist()
-        features["zcr_mean"] = float(np.mean(zcr))
-        features["zcr_std"] = float(np.std(zcr))
-        # Pitch analysis
-        pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate)
-        f0 = []
-        for t in range(pitches.shape[1]):
-            index = magnitudes[:, t].argmax()
-            pitch = pitches[index, t]
-            f0.append(
-                float(pitch) if pitch > 80 else 0.0
-            )  # Filter out very low frequencies
-        features["f0"] = f0
-        valid_f0 = [f for f in f0 if f > 0]
-        features["f0_mean"] = float(np.mean(valid_f0)) if valid_f0 else 0.0
-        features["f0_std"] = float(np.std(valid_f0)) if valid_f0 else 0.0
-        # Formant estimation (simplified)
-        features["formants"] = self._estimate_formants(audio)
-        return features
-    def _analyze_text(self, text: str) -> Dict:
-        """Analyze reference text for phonemes and difficulty"""
-        words = text.lower().strip().split()
-        text_info = {
-            "words": [],
-            "total_phonemes": 0,
-            "difficulty_score": 0,
-            "challenging_sounds": [],
         }
-        all_phonemes = []
-        for word in words:
-            word_info = self.phoneme_processor.get_word_phonemes(word)
-            # Calculate word difficulty
-            word_difficulty = self.phoneme_processor.get_difficulty_score(
-                word_info.phonemes
-            )
-            # Find challenging phonemes
-            challenging = []
-            for phoneme in word_info.phonemes:
-                clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-                difficulty = self.phoneme_processor.difficulty_map.get(clean_phoneme, 0)
-                if difficulty > 0.6:
-                    challenging.append(clean_phoneme)
-            word_data = {
-                "word": word,
-                "phonemes": word_info.phonemes,
-                "ipa": word_info.ipa_transcription,
-                "syllables": word_info.syllables,
-                "difficulty": word_difficulty,
-                "challenging_phonemes": challenging,
             }
-            text_info["words"].append(word_data)
-            all_phonemes.extend(word_info.phonemes)
-            text_info["challenging_sounds"].extend(challenging)
-        text_info["total_phonemes"] = len(all_phonemes)
-        text_info["difficulty_score"] = self.phoneme_processor.get_difficulty_score(
-            all_phonemes
-        )
-        text_info["challenging_sounds"] = list(
-            set(text_info["challenging_sounds"])
-        )  # Remove duplicates
-        return text_info
-    def _assess_pronunciation(
-        self, audio: np.ndarray, features: Dict, text: str, text_analysis: Dict
-    ) -> Dict:
-        """Comprehensive pronunciation assessment"""
-        words = text.lower().strip().split()
-        word_segments = self._segment_words_advanced(audio, features, len(words))
-        word_results = []
-        phoneme_results = []
-        for i, word in enumerate(words):
-            if i < len(word_segments):
-                word_audio = word_segments[i]
-                word_info = text_analysis["words"][i]
-                # Assess word
-                word_result = self._assess_word_comprehensive(
-                    word_audio, word_info, features, i, len(words)
-                )
-                word_results.append(word_result)
-                phoneme_results.extend(word_result["phoneme_details"])
-        # Calculate overall metrics
-        overall_score = (
-            np.mean([wr["score"] for wr in word_results]) if word_results else 0.0
-        )
-        # Generate comprehensive feedback
-        feedback = self._generate_comprehensive_feedback(
-            word_results, text_analysis, features, overall_score
-        )
-        # Difficulty analysis
-        difficulty_analysis = self._analyze_difficulty_performance(
-            word_results, text_analysis
         )
         return {
-            "overall_score": overall_score,
-            "words": word_results,
-            "phoneme_details": phoneme_results,
-            "feedback": feedback,
-            "status": self._get_status(overall_score),
-            "difficulty_analysis": difficulty_analysis,
         }
-    def _segment_words_advanced(
-        self, audio: np.ndarray, features: Dict, num_words: int
-    ) -> List[np.ndarray]:
-        """Advanced word segmentation using energy and spectral cues"""
-        if num_words == 1:
-            return [audio]
-        # Use RMS energy to find word boundaries
-        rms = features["rms"]
-        # Find energy peaks (potential word centers)
-        from scipy.signal import find_peaks
-        # Smooth RMS for better peak detection
-        window_size = min(5, len(rms) // 4)
-        if window_size > 0:
-            rms_smooth = np.convolve(
-                rms, np.ones(window_size) / window_size, mode="same"
-            )
-        else:
-            rms_smooth = rms
-        peaks, _ = find_peaks(
-            rms_smooth,
-            height=np.mean(rms_smooth) * 0.5,
-            distance=len(rms) // (num_words * 2),
-        )
-        # If we don't find enough peaks, fall back to equal division
-        if len(peaks) < num_words:
-            segment_length = len(audio) // num_words
-            segments = []
-            for i in range(num_words):
-                start = i * segment_length
-                end = start + segment_length if i < num_words - 1 else len(audio)
-                segments.append(audio[start:end])
-            return segments
-        # Use peaks to define word boundaries
-        hop_length = 512
-        peak_times = librosa.frames_to_samples(peaks, hop_length=hop_length)
-        segments = []
-        for i in range(num_words):
-            if i == 0:
-                start = 0
-                end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
-                    num_words * 4
-                )
-            elif i == num_words - 1:
-                start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
-                    num_words * 4
-                )
-                end = len(audio)
-            else:
-                start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
-                    num_words * 6
-                )
-                end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
-                    num_words * 6
-                )
-            start = max(0, start)
-            end = min(len(audio), end)
-            segments.append(audio[start:end])
-        return segments
-    def _assess_word_comprehensive(
-        self,
-        word_audio: np.ndarray,
-        word_info: Dict,
-        global_features: Dict,
-        word_index: int,
-        total_words: int,
-    ) -> Dict:
-        """Score one word's audio segment and its individual phonemes.
-        The audio is split into one segment per entry of
-        ``word_info["phonemes"]``; every segment longer than 100 samples is
-        scored with ``self.phoneme_processor.score_phoneme_advanced`` and the
-        word score is the mean of those scores (0.0 when none were scored).
-        Args:
-            word_audio: Samples belonging to this word.
-            word_info: Dict read for the keys "word", "phonemes", "ipa",
-                "syllables" and "difficulty".
-            global_features: Not read by this method.
-            word_index: Index of the word; only used to compute the
-                "word_position" context value.
-            total_words: Divisor for "word_position".
-        Returns:
-            A result dict. For audio shorter than 500 samples it only holds
-            "word", "score" (fixed 0.2), "status", "issues" and
-            "phoneme_details"; otherwise it also holds the phoneme lists,
-            "ipa", "syllables" and "difficulty".
-        """
-        # Too little audio to analyse: return a fixed low score
-        if len(word_audio) < 500:
-            return {
-                "word": word_info["word"],
-                "score": 0.2,
-                "status": "poor",
-                "issues": ["too_short"],
-                "phoneme_details": [],
             }
-        # Extract word-level features
-        word_features = self._extract_word_features(word_audio)
-        # Assess each phoneme
-        phonemes = word_info["phonemes"]
-        phoneme_segments = self._segment_phonemes(word_audio, len(phonemes))
-        phoneme_scores = []
-        phoneme_details = []
-        for i, (phoneme, segment) in enumerate(zip(phonemes, phoneme_segments)):
-            # Shorter segments are skipped, so phoneme_scores can end up with
-            # fewer entries than phonemes
-            if len(segment) > 100:  # Minimum segment length
-                segment_features = self._extract_segment_features(segment)
-                # Context information
-                context = {
-                    "position": (
-                        "initial"
-                        if i == 0
-                        else "final" if i == len(phonemes) - 1 else "middle"
-                    ),
-                    "prev_phoneme": phonemes[i - 1] if i > 0 else None,
-                    "next_phoneme": phonemes[i + 1] if i < len(phonemes) - 1 else None,
-                    # NOTE(review): divides by total_words; a value of 0 would raise
-                    "word_position": word_index / total_words,
                 }
-                score = self.phoneme_processor.score_phoneme_advanced(
-                    phoneme, segment_features, context
-                )
-                phoneme_scores.append(score)
-                phoneme_details.append(
-                    {
-                        "phoneme": phoneme,
-                        "score": score,
-                        "position": context["position"],
-                        # Digits are stripped from the phoneme before the lookup;
-                        # phonemes missing from the map default to 0.3
-                        "difficulty": self.phoneme_processor.difficulty_map.get(
-                            re.sub(r"[0-9]", "", phoneme), 0.3
-                        ),
-                        "word": word_info["word"],
-                    }
-                )
-        # Word-level score
-        word_score = np.mean(phoneme_scores) if phoneme_scores else 0.0
-        # Detect issues
-        issues = []
-        if word_score < 0.3:
-            issues.append("very_poor_clarity")
-        # word_features is {} for very short audio, hence the .get() defaults
-        if word_features.get("rms_mean", 0) < 0.005:
-            issues.append("too_quiet")
-        if word_features.get("zcr_mean", 0) > 0.3:
-            issues.append("too_noisy")
-        return {
-            "word": word_info["word"],
-            "score": word_score,
-            "status": self._get_word_status(word_score),
-            "phonemes": phonemes,
-            "phoneme_scores": phoneme_scores,
-            "phoneme_details": phoneme_details,
-            "ipa": word_info["ipa"],
-            "syllables": word_info["syllables"],
-            "difficulty": word_info["difficulty"],
-            "issues": issues,
-        }
-    def _extract_word_features(self, word_audio: np.ndarray) -> Dict:
-        """Compute mean acoustic features for one word's audio.
-        Returns an empty dict when the audio has fewer than 100 samples.
-        Otherwise returns "mfcc_mean" (list with the per-coefficient mean of
-        13 MFCCs) plus the float means "rms_mean", "spectral_centroid_mean"
-        and "zcr_mean".
-        """
-        if len(word_audio) < 100:
-            return {}
-        mfcc = librosa.feature.mfcc(y=word_audio, sr=self.sample_rate, n_mfcc=13)
-        # The [0] index takes the first row of each librosa feature array
-        rms = librosa.feature.rms(y=word_audio)[0]
-        centroid = librosa.feature.spectral_centroid(y=word_audio, sr=self.sample_rate)[
-            0
-        ]
-        zcr = librosa.feature.zero_crossing_rate(word_audio)[0]
-        return {
-            "mfcc_mean": np.mean(mfcc, axis=1).tolist(),
-            "rms_mean": float(np.mean(rms)),
-            "spectral_centroid_mean": float(np.mean(centroid)),
-            "zcr_mean": float(np.mean(zcr)),
-        }
-    def _segment_phonemes(
-        self, word_audio: np.ndarray, num_phonemes: int
-    ) -> List[np.ndarray]:
-        """Split word audio into ``num_phonemes`` equal-length slices.
-        No acoustic boundary detection is done: every slice has
-        ``len(word_audio) // num_phonemes`` samples, except the last one,
-        which also takes the remainder. With ``num_phonemes <= 1`` the whole
-        audio is returned as a single segment.
-        """
-        if num_phonemes <= 1:
-            return [word_audio]
-        segment_length = len(word_audio) // num_phonemes
-        segments = []
-        for i in range(num_phonemes):
-            start = i * segment_length
-            # The final segment runs to the end so no samples are dropped
-            end = start + segment_length if i < num_phonemes - 1 else len(word_audio)
-            segments.append(word_audio[start:end])
-        return segments
-    def _extract_segment_features(self, segment: np.ndarray) -> Dict:
-        """Compute mean acoustic features for one phoneme segment.
-        Returns an empty dict when the segment has fewer than 50 samples.
-        Otherwise returns the float means "rms_mean", "zcr_mean" and
-        "spectral_centroid_mean" plus "mfcc_mean", a list of 5 values that is
-        all zeros unless the segment is longer than 512 samples.
-        """
-        if len(segment) < 50:
-            return {}
-        # Basic features for short segments
-        rms_mean = float(np.mean(librosa.feature.rms(y=segment)[0]))
-        zcr_mean = float(np.mean(librosa.feature.zero_crossing_rate(segment)[0]))
-        # Spectral centroid
-        centroid = librosa.feature.spectral_centroid(y=segment, sr=self.sample_rate)[0]
-        centroid_mean = float(np.mean(centroid))
-        # MFCC for short segment
-        # Only computed above 512 samples; shorter segments get a zero placeholder
-        if len(segment) > 512:
-            mfcc = librosa.feature.mfcc(y=segment, sr=self.sample_rate, n_mfcc=5)
-            mfcc_mean = np.mean(mfcc, axis=1).tolist()
-        else:
-            mfcc_mean = [0] * 5
-        return {
-            "rms_mean": rms_mean,
-            "zcr_mean": zcr_mean,
-            "spectral_centroid_mean": centroid_mean,
-            "mfcc_mean": mfcc_mean,
-        }
-    def _generate_comprehensive_feedback(
-        self,
-        word_results: List[Dict],
-        text_analysis: Dict,
-        features: Dict,
-        overall_score: float,
-    ) -> List[str]:
-        """Build the list of feedback messages for one assessment.
-        Args:
-            word_results: Per-word result dicts; read for "score", "word" and
-                the optional "phoneme_details" list.
-            text_analysis: Dict read for "difficulty_score".
-            features: Dict read for "rms_mean" and "f0_std" (both default to
-                0 when missing).
-            overall_score: Score compared against the 0.85 / 0.7 / 0.5 / 0.3
-                thresholds to pick the opening message.
-        Returns:
-            Feedback strings; the first entry is always the overall
-            performance message.
-        """
-        feedback = []
-        # Overall performance feedback
-        if overall_score >= 0.85:
-            feedback.append(
-                "🎉 Outstanding pronunciation! You sound very natural and clear."
-            )
-        elif overall_score >= 0.7:
-            feedback.append(
-                "👍 Great job! Your pronunciation is quite good with room for minor improvements."
-            )
-        elif overall_score >= 0.5:
-            feedback.append(
-                "📚 Good progress! Keep practicing the areas highlighted below."
-            )
-        elif overall_score >= 0.3:
-            feedback.append(
-                "🔄 Keep working on it! Focus on clarity and the specific sounds mentioned."
-            )
-        else:
-            feedback.append(
-                "💪 Don't give up! Start with slower, clearer pronunciation."
-            )
-        # Audio quality feedback
-        # Values between 0.01 and 0.15 produce no volume message
-        audio_quality = features.get("rms_mean", 0)
-        if audio_quality < 0.01:
-            feedback.append(
-                "🔊 Try speaking louder and more clearly - your recording was quite quiet."
-            )
-        elif audio_quality > 0.15:
-            feedback.append("🔉 Good volume level! Your voice comes through clearly.")
-        # Pitch variation feedback
-        # A missing "f0_std" defaults to 0 and therefore triggers the low-variation tip
-        pitch_std = features.get("f0_std", 0)
-        if pitch_std < 20:
-            feedback.append(
-                "🎵 Try adding more natural pitch variation to sound more engaging."
-            )
-        elif pitch_std > 80:
-            feedback.append(
-                "🎵 Good pitch variation! Your speech sounds natural and expressive."
-            )
-        # Word-specific feedback
-        poor_words = [wr for wr in word_results if wr["score"] < 0.5]
-        if poor_words:
-            word_names = [w["word"] for w in poor_words]
-            feedback.append(f"🎯 Focus extra practice on: {', '.join(word_names)}")
-        # Phoneme-specific feedback for Vietnamese speakers
-        all_challenging = []
-        for word_result in word_results:
-            for phoneme_detail in word_result.get("phoneme_details", []):
-                if phoneme_detail["score"] < 0.5 and phoneme_detail["difficulty"] > 0.6:
-                    all_challenging.append(phoneme_detail["phoneme"])
-        if all_challenging:
-            # NOTE(review): set() has no defined order, so the three phonemes
-            # picked below are arbitrary rather than ranked by difficulty
-            unique_challenging = list(set(all_challenging))
-            vietnamese_tips = {
-                "TH": "Put your tongue between your teeth and blow air gently",
-                "DH": "Same tongue position as TH, but vibrate your vocal cords",
-                "V": "Touch your bottom lip to your top teeth, then voice",
-                "R": "Curl your tongue without touching the roof of your mouth",
-                "L": "Touch your tongue tip to the roof of your mouth",
-                "Z": "Like 'S' but with vocal cord vibration",
-            }
-            for phoneme in unique_challenging[:3]:  # Top 3 challenging
-                clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-                if clean_phoneme in vietnamese_tips:
-                    feedback.append(
-                        f"🔤 {clean_phoneme} sound: {vietnamese_tips[clean_phoneme]}"
-                    )
-        # Difficulty-based encouragement
-        text_difficulty = text_analysis["difficulty_score"]
-        if text_difficulty > 0.7 and overall_score > 0.6:
-            feedback.append(
-                "💪 Impressive! You tackled some very challenging sounds for Vietnamese speakers."
-            )
-        elif text_difficulty < 0.3 and overall_score < 0.7:
-            feedback.append("📈 Try some more challenging words as you improve!")
-        return feedback
-    def _analyze_difficulty_performance(
-        self, word_results: List[Dict], text_analysis: Dict
-    ) -> Dict:
-        """Summarise phoneme scores grouped by phoneme difficulty.
-        Every entry of each word's "phoneme_details" is put into an easy
-        (difficulty < 0.4), medium (< 0.7) or hard bucket. The result holds
-        the average score per bucket (0.0 for an empty bucket), the number of
-        hard phonemes, how many hard phonemes scored above 0.7, and the
-        text's "difficulty_score" copied from ``text_analysis``.
-        """
-        easy_phonemes = []  # difficulty < 0.4
-        medium_phonemes = []  # 0.4 <= difficulty < 0.7
-        hard_phonemes = []  # difficulty >= 0.7
-        for word_result in word_results:
-            for phoneme_detail in word_result.get("phoneme_details", []):
-                difficulty = phoneme_detail["difficulty"]
-                score = phoneme_detail["score"]
-                if difficulty < 0.4:
-                    easy_phonemes.append(score)
-                elif difficulty < 0.7:
-                    medium_phonemes.append(score)
-                else:
-                    hard_phonemes.append(score)
-        return {
-            "easy_sounds_avg": float(np.mean(easy_phonemes)) if easy_phonemes else 0.0,
-            "medium_sounds_avg": (
-                float(np.mean(medium_phonemes)) if medium_phonemes else 0.0
-            ),
-            "hard_sounds_avg": float(np.mean(hard_phonemes)) if hard_phonemes else 0.0,
-            "total_challenging_sounds": len(hard_phonemes),
-            "mastered_difficult_sounds": len([s for s in hard_phonemes if s > 0.7]),
-            "text_difficulty": text_analysis["difficulty_score"],
-        }
     def _get_word_status(self, score: float) -> str:
         """Get word status from score"""
         if score >= 0.8:
@@ -1154,475 +529,370 @@ class EnhancedPronunciationAssessor:
             return "needs_practice"
         else:
             return "poor"
-    def _get_status(self, score: float) -> str:
-        """Return the overall status label; delegates to ``_get_word_status``."""
-        return self._get_word_status(score)
 # =============================================================================
-# ENHANCED FASTAPI APP
 # =============================================================================
-# Module-level assessor instance used by the route handlers below
-assessor = EnhancedPronunciationAssessor()
 # =============================================================================
-# ENHANCED ENDPOINTS
 # =============================================================================
-@router.post("/assess", response_model=PronunciationResult)
 async def assess_pronunciation(
-    audio: UploadFile = File(..., description="Audio file"),
-    reference_text: str = Form(..., description="Any English text"),
-    difficulty_level: str = Form("medium", description="easy, medium, hard"),
 ):
     """
-    Assess pronunciation for ANY English text
-    Supports 60,000+ words from CMU Pronouncing Dictionary
     """
-    # Flow: validate the text (HTTP 400 on failure), write the upload to a
-    # temporary file, run the assessor on it, rescale scores for the requested
-    # difficulty level and return a PronunciationResult. Any exception raised
-    # inside the try block below is reported as HTTP 500.
     import time
     start_time = time.time()
-    print(f"Starting pronunciation assessment...")
-    print("Reference text:", reference_text)
-    print("Difficulty level:", difficulty_level)
-    print("Audio filename:", audio.filename if audio else "None")
     # Validate inputs
     if not reference_text.strip():
-        print("Validation failed: Reference text is empty")
         raise HTTPException(status_code=400, detail="Reference text cannot be empty")
-    if len(reference_text) > 1000:
-        print("Validation failed: Reference text too long")
-        raise HTTPException(
-            status_code=400, detail="Reference text too long (max 1000 characters)"
-        )
-    # Check if text contains only valid characters
-    # Updated regex to be more permissive and include common punctuation like commas
     if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
-        print("Validation failed: Invalid characters in text")
-        print("Text that failed validation:", repr(reference_text))
         raise HTTPException(
             status_code=400,
-            detail="Text contains invalid characters. Only English letters, spaces, and basic punctuation (,.'-!?;:) allowed.",
         )
     try:
-        # Save uploaded file
-        print("Saving uploaded file...")
-        # Handle cases where filename might be None or empty
         file_extension = ".wav"
-        if audio.filename:
-            file_extension = f".{audio.filename.split('.')[-1]}" if '.' in audio.filename else ".wav"
-        # NOTE(review): delete=False plus the unlink at the end of this block
-        # means the temp file is left on disk whenever processing raises
-        with tempfile.NamedTemporaryFile(
-            delete=False, suffix=file_extension
-        ) as tmp_file:
             content = await audio.read()
             tmp_file.write(content)
             tmp_file.flush()
-            print("File saved to:", tmp_file.name)
-            print("File size:", len(content), "bytes")
-            # Process with enhanced assessor
-            print("Processing audio file...")
-            result = assessor.process_audio_file(tmp_file.name, reference_text)
-            print("Audio processing completed")
-            # Clean up
-            os.unlink(tmp_file.name)
-        # Apply difficulty adjustments
-        # "easy" scales scores by 1.2 (capped at 1.0), "hard" by 0.8; any other
-        # value, including the default "medium", leaves them unchanged
-        analysis = result["pronunciation_analysis"]
-        if difficulty_level == "easy":
-            analysis["overall_score"] = min(1.0, analysis["overall_score"] * 1.2)
-            for word in analysis["words"]:
-                word["score"] = min(1.0, word["score"] * 1.2)
-        elif difficulty_level == "hard":
-            analysis["overall_score"] = analysis["overall_score"] * 0.8
-            for word in analysis["words"]:
-                word["score"] = word["score"] * 0.8
         processing_time = time.time() - start_time
-        print("Processing completed successfully in", processing_time, "seconds")
-        return PronunciationResult(
-            overall_score=analysis["overall_score"],
-            status=analysis["status"],
-            feedback=analysis["feedback"],
-            words=analysis["words"],
-            phoneme_details=analysis["phoneme_details"],
-            audio_info=result["audio_info"],
-            processing_time=processing_time,
-            difficulty_analysis=analysis["difficulty_analysis"],
-        )
     except Exception as e:
-        print("Exception occurred during processing:", str(e))
         import traceback
         traceback.print_exc()
-        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")
 @router.get("/phonemes/{word}")
 async def get_word_phonemes(word: str):
-    """Get comprehensive phoneme information for ANY English word"""
-    # Looks the word up through assessor.phoneme_processor and returns its
-    # phonemes, IPA, syllables, stress pattern, a difficulty score/level and
-    # tips. Any exception is reported as HTTP 500.
     try:
-        word_info = assessor.phoneme_processor.get_word_phonemes(word)
-        # Calculate difficulty for Vietnamese speakers
-        difficulty = assessor.phoneme_processor.get_difficulty_score(word_info.phonemes)
-        # Get challenging phonemes
-        challenging_phonemes = []
-        for phoneme in word_info.phonemes:
-            # Digits are stripped before the difficulty lookup; phonemes
-            # missing from the map count as difficulty 0
-            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-            phoneme_difficulty = assessor.phoneme_processor.difficulty_map.get(
-                clean_phoneme, 0
-            )
-            if phoneme_difficulty > 0.6:
-                challenging_phonemes.append(
-                    {
-                        "phoneme": clean_phoneme,
-                        "difficulty": phoneme_difficulty,
-                        "tips": get_phoneme_tips(clean_phoneme),
-                    }
-                )
         return {
             "word": word,
-            "phonemes": word_info.phonemes,
-            "ipa_transcription": word_info.ipa_transcription,
-            "syllables": word_info.syllables,
-            "stress_pattern": word_info.stress_pattern,
-            "difficulty_score": difficulty,
-            # Level cut-offs: above 0.7 is "hard", above 0.4 is "medium"
-            "difficulty_level": (
-                "hard" if difficulty > 0.7 else "medium" if difficulty > 0.4 else "easy"
-            ),
-            "challenging_phonemes": challenging_phonemes,
-            "pronunciation_tips": get_word_pronunciation_tips(word, word_info.phonemes),
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error processing word: {str(e)}")
-@router.post("/analyze/text")
-async def analyze_text_difficulty(text: str = Form(...)):
-    """Analyze pronunciation difficulty of any English text"""
-    # Thin wrapper around assessor._analyze_text: reshapes its result into the
-    # response dict and reports any exception as HTTP 500.
     try:
-        text_analysis = assessor._analyze_text(text)
-        return {
-            "text": text,
-            "word_count": len(text_analysis["words"]),
-            "total_phonemes": text_analysis["total_phonemes"],
-            "overall_difficulty": text_analysis["difficulty_score"],
-            # Level cut-offs: above 0.7 is "hard", above 0.4 is "medium"
-            "difficulty_level": (
-                "hard"
-                if text_analysis["difficulty_score"] > 0.7
-                else "medium" if text_analysis["difficulty_score"] > 0.4 else "easy"
-            ),
-            "challenging_sounds": text_analysis["challenging_sounds"],
-            "word_breakdown": text_analysis["words"],
-            "recommendations": get_text_recommendations(text_analysis),
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Text analysis error: {str(e)}")
-@router.get("/dictionary/search")
-async def search_dictionary(query: str, limit: int = 20):
-    """Search CMU dictionary for words containing query"""
-    # Substring search over the keys of the assessor's CMU dictionary.
-    # Any exception is reported as HTTP 500.
     try:
-        cmu_dict = assessor.phoneme_processor.cmu_dict
-        # Search for words containing the query
-        matching_words = []
-        query_lower = query.lower()
-        # NOTE(review): the loop keeps scanning every key after `limit` matches
-        # were collected, and the limit is applied before sorting, so the result
-        # is the first `limit` matches in dictionary order, not the easiest ones
-        for word in cmu_dict.keys():
-            if query_lower in word and len(matching_words) < limit:
-                word_info = assessor.phoneme_processor.get_word_phonemes(word)
-                difficulty = assessor.phoneme_processor.get_difficulty_score(
-                    word_info.phonemes
-                )
-                matching_words.append(
-                    {
-                        "word": word,
-                        "phonemes": word_info.phonemes,
-                        "ipa": word_info.ipa_transcription,
-                        "difficulty": difficulty,
-                        "difficulty_level": (
-                            "hard"
-                            if difficulty > 0.7
-                            else "medium" if difficulty > 0.4 else "easy"
-                        ),
-                    }
-                )
-        # Sort by difficulty (easiest first)
-        matching_words.sort(key=lambda x: x["difficulty"])
-        return {"query": query, "found": len(matching_words), "words": matching_words}
     except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Dictionary search error: {str(e)}"
-        )
-@router.get("/practice/level/{level}")
-async def get_practice_words(level: str, count: int = 10):
-    """Get practice words by difficulty level"""
-    # Returns up to `count` randomly chosen dictionary words whose difficulty
-    # falls in the range for `level`. An unknown level gives HTTP 400; any
-    # exception during the search is reported as HTTP 500.
-    if level not in ["easy", "medium", "hard"]:
-        raise HTTPException(
-            status_code=400, detail="Level must be easy, medium, or hard"
-        )
-    try:
-        cmu_dict = assessor.phoneme_processor.cmu_dict
-        practice_words = []
-        # Define difficulty ranges
-        # Both bounds are inclusive in the check below, so a difficulty of
-        # exactly 0.4 or 0.7 qualifies for two adjacent levels
-        if level == "easy":
-            difficulty_range = (0, 0.4)
-        elif level == "medium":
-            difficulty_range = (0.4, 0.7)
-        else:  # hard
-            difficulty_range = (0.7, 1.0)
-        # Sample words from dictionary
-        # The whole key list is shuffled, so each request returns different words
-        word_list = list(cmu_dict.keys())
-        np.random.shuffle(word_list)
-        for word in word_list:
-            if len(practice_words) >= count:
-                break
-            # Skip very short or very long words
-            if len(word) < 3 or len(word) > 12:
-                continue
-            # Skip words with special characters
-            if not word.isalpha():
-                continue
-            word_info = assessor.phoneme_processor.get_word_phonemes(word)
-            difficulty = assessor.phoneme_processor.get_difficulty_score(
-                word_info.phonemes
-            )
-            if difficulty_range[0] <= difficulty <= difficulty_range[1]:
-                practice_words.append(
-                    {
-                        "word": word,
-                        "phonemes": word_info.phonemes,
-                        "ipa": word_info.ipa_transcription,
-                        "difficulty": difficulty,
-                        "tips": get_word_pronunciation_tips(word, word_info.phonemes),
-                    }
-                )
         return {
-            "level": level,
-            "difficulty_range": difficulty_range,
-            "count": len(practice_words),
-            "words": practice_words,
         }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Practice words error: {str(e)}")
 # =============================================================================
 # HELPER FUNCTIONS
 # =============================================================================
-def get_phoneme_tips(phoneme: str) -> List[str]:
-    """Return articulation tips for a phoneme.
-    The lookup is an exact key match against the table below (TH, DH, V, R,
-    L, Z); any other value returns a single generic practice tip.
-    """
-    tips_dict = {
-        "TH": [
-            "Place tongue tip between upper and lower teeth",
-            "Blow air gently while keeping tongue in position",
-            "Should feel air flowing over tongue",
-        ],
-        "DH": [
-            "Same tongue position as TH",
-            "Add vocal cord vibration",
-            "Should feel buzzing in throat",
-        ],
-        "V": [
-            "Touch bottom lip to upper teeth",
-            "Voice while air flows through the gap",
-            "Don't use both lips like Vietnamese 'V'",
-        ],
-        "R": [
-            "Curl tongue without touching roof of mouth",
-            "Don't roll the R like in Vietnamese",
-            "Tongue should float freely",
-        ],
-        "L": [
-            "Touch tongue tip to roof of mouth behind teeth",
-            "Let air flow around sides of tongue",
-            "Make sure tongue actually touches",
-        ],
-        "Z": [
-            "Same tongue position as 'S'",
-            "Add vocal cord vibration",
-            "Should buzz like a bee",
-        ],
     }
-    return tips_dict.get(phoneme, ["Practice this sound slowly and clearly"])
-def get_word_pronunciation_tips(word: str, phonemes: List[str]) -> List[str]:
-    """Return pronunciation tips derived from a word's phoneme list.
-    Args:
-        word: The word itself; not read by this function.
-        phonemes: Phoneme symbols of the word.
-    Returns:
-        One tip per rule that matches (consonant cluster, TH, R together
-        with L, final stop consonant, long vowel), or a single generic tip
-        when no rule matches.
-    """
-    tips = []
-    # Check for challenging combinations
-    phoneme_str = " ".join(phonemes)
-    # Consonant clusters
-    # NOTE(review): plain substring test, so "S T" also matches "S TH"
-    if "S T" in phoneme_str or "S K" in phoneme_str or "S P" in phoneme_str:
-        tips.append("Practice the consonant cluster slowly, then speed up")
-    # TH sounds
-    if "TH" in phonemes:
-        tips.append("Remember: tongue between teeth for TH sounds")
-    # R and L distinction
-    if "R" in phonemes and "L" in phonemes:
-        tips.append("Focus on R (no touching) vs L (tongue touches roof)")
-    # Final consonants (Vietnamese tendency to drop)
-    final_phoneme = phonemes[-1] if phonemes else ""
-    if final_phoneme in ["T", "D", "K", "G", "P", "B"]:
-        tips.append("Don't forget the final consonant sound")
-    # Vowel length
-    # Digits are stripped before comparing against the long-vowel symbols
-    vowel_phonemes = [
-        p for p in phonemes if re.sub(r"[0-9]", "", p) in ["IY", "UW", "AO"]
-    ]
-    if vowel_phonemes:
-        tips.append("Make sure long vowels are actually longer")
-    if not tips:
-        tips.append("Break the word into syllables and practice each part")
-    return tips
-def get_text_recommendations(text_analysis: Dict) -> List[str]:
-    """Return practice recommendations for an analysed text.
-    ``text_analysis`` is read for "difficulty_score", "challenging_sounds"
-    and "words" (each word dict must provide "phonemes"). The returned list
-    may be empty when no rule applies.
-    """
-    recommendations = []
-    difficulty = text_analysis["difficulty_score"]
-    # Scores from 0.3 to 0.8 produce no difficulty recommendation
-    if difficulty < 0.3:
-        recommendations.append(
-            "This text is good for beginners. Try adding more challenging words gradually."
-        )
-    elif difficulty > 0.8:
-        recommendations.append(
-            "This is very challenging text. Consider starting with easier words first."
-        )
-    challenging_sounds = text_analysis["challenging_sounds"]
-    if len(challenging_sounds) > 5:
-        recommendations.append(
-            "This text has many challenging sounds. Practice individual words first."
-        )
-    # Word length recommendations
-    # "Long" here means more than 8 phonemes, not more than 8 letters
-    long_words = [w for w in text_analysis["words"] if len(w["phonemes"]) > 8]
-    if long_words:
-        recommendations.append(
-            "Break down longer words into syllables for easier practice."
-        )
-    return recommendations
-# =============================================================================
-# ADDITIONAL ENDPOINTS
-# =============================================================================
-@router.get("/stats")
-async def get_system_stats():
-    """Get system statistics"""
-    # Only the two counts are computed from the assessor; every other value in
-    # the response is a hard-coded literal.
-    cmu_dict = assessor.phoneme_processor.cmu_dict
-    return {
-        "total_words_supported": len(cmu_dict),
-        "phonemes_supported": len(assessor.phoneme_processor.phoneme_models),
-        "difficulty_levels": ["easy", "medium", "hard"],
-        "audio_formats_supported": ["wav", "mp3", "m4a", "flac"],
-        "max_audio_duration": "30 seconds",
-        "vietnamese_specific_features": True,
-        "features": [
-            "CMU Pronouncing Dictionary integration",
-            "IPA transcription",
-            "Syllable analysis",
-            "Contextual phoneme scoring",
-            "Vietnamese learner optimization",
-        ],
-    }
-@router.get("/phonemes/difficult")
-async def get_difficult_phonemes_for_vietnamese():
-    """Get phonemes that are most difficult for Vietnamese speakers"""
-    # NOTE(review): this path is registered on the same router after
-    # "/phonemes/{word}"; if routes are matched in registration order, requests
-    # for /phonemes/difficult reach get_word_phonemes instead - confirm.
-    difficult_phonemes = []
-    for phoneme, difficulty in assessor.phoneme_processor.difficulty_map.items():
-        if difficulty > 0.6:  # Only include challenging ones
-            difficult_phonemes.append(
-                {
-                    "phoneme": phoneme,
-                    "difficulty": difficulty,
-                    "tips": get_phoneme_tips(phoneme),
-                    "example_words": get_example_words(phoneme),
-                }
-            )
-    # Sort by difficulty (hardest first)
-    difficult_phonemes.sort(key=lambda x: x["difficulty"], reverse=True)
-    return {
-        "difficult_phonemes": difficult_phonemes,
-        "total_count": len(difficult_phonemes),
-        "recommendation": "Focus on the top 5 most difficult sounds first",
-    }
-def get_example_words(phoneme: str) -> List[str]:
-    """Return example words for a phoneme.
-    The lookup is an exact key match against the table below. A phoneme
-    without an entry returns a one-element list holding the placeholder
-    string ``word_with_<phoneme in lower case>``, which is not a real word.
-    """
-    examples = {
-        "TH": ["think", "three", "math", "path"],
-        "DH": ["this", "that", "mother", "weather"],
-        "V": ["very", "love", "give", "have"],
-        "Z": ["zoo", "zero", "buzz", "rise"],
-        "R": ["red", "car", "very", "right"],
-        "L": ["love", "hello", "well", "people"],
-        "W": ["water", "well", "what", "sweet"],
-        "ZH": ["measure", "vision", "treasure"],
-        "CH": ["chair", "much", "teach"],
-        "JH": ["job", "bridge", "age"],
-        "SH": ["shoe", "fish", "nation"],
-        "NG": ["ring", "thing", "young"],
-    }
-    return examples.get(phoneme, [f"word_with_{phoneme.lower()}"])

+# PRONUNCIATION ASSESSMENT USING WAV2VEC2PHONEME
+# Input: Audio + Reference Text → Output: Word highlights + Phoneme diff + Wrong words
+# Uses Wav2Vec2Phoneme for accurate phoneme-level transcription without language model correction
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from typing import List, Dict, Optional
 import tempfile
 import os
 import numpy as np
 import librosa
 import nltk
 import eng_to_ipa as ipa
+import torch
 import re
 from collections import defaultdict
 import warnings
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2PhonemeCTCTokenizer
 warnings.filterwarnings("ignore")
 # Download required NLTK data
 try:
     nltk.download("cmudict", quiet=True)
     from nltk.corpus import cmudict
 except:
     print("Warning: NLTK data not available")
 # =============================================================================
 # MODELS
 # =============================================================================
+# Router for this module; its routes are served under the "/pronunciation" prefix
+router = APIRouter(prefix="/pronunciation", tags=["Pronunciation"])
+# Result schema of a pronunciation assessment (presumably used as the response
+# model of the assessment endpoint - confirm against the route definitions).
+class PronunciationAssessmentResult(BaseModel):
+    transcript: str  # What the user actually said (character transcript)
+    transcript_phonemes: str  # User's phonemes
+    user_phonemes: str  # Alias for transcript_phonemes for UI clarity
+    character_transcript: str  # presumably the same text as `transcript` - TODO confirm
     overall_score: float
+    word_highlights: List[Dict]  # per-word entries; dict schema is not fixed by this model
+    phoneme_differences: List[Dict]  # dict schema is not fixed by this model
+    wrong_words: List[Dict]  # dict schema is not fixed by this model
     feedback: List[str]
+    processing_info: Dict
 # =============================================================================
+# WAV2VEC2 CHARACTER ASR
 # =============================================================================
+class Wav2Vec2CharacterASR:
+    """Wav2Vec2 character-level ASR without language model correction"""
+    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
+        """
+        Initialize Wav2Vec2 character-level model
+        Available models:
+        - facebook/wav2vec2-large-960h-lv60-self (character-level, no LM)
+        - facebook/wav2vec2-base-960h (character-level, no LM)
+        - facebook/wav2vec2-large-960h (character-level, no LM)
+        """
+        print(f"Loading Wav2Vec2 character model: {model_name}")
+        try:
+            self.processor = Wav2Vec2Processor.from_pretrained(model_name)
+            self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
+            self.model.eval()
+            print("Wav2Vec2 character model loaded successfully")
+            self.model_name = model_name
+        except Exception as e:
+            print(f"Error loading model {model_name}: {e}")
+            # Fallback to base model
+            fallback_model = "facebook/wav2vec2-base-960h"
+            print(f"Trying fallback model: {fallback_model}")
+            try:
+                self.processor = Wav2Vec2Processor.from_pretrained(fallback_model)
+                self.model = Wav2Vec2ForCTC.from_pretrained(fallback_model)
+                self.model.eval()
+                self.model_name = fallback_model
+                print("Fallback model loaded successfully")
+            except Exception as e2:
+                raise Exception(f"Failed to load both models. Original error: {e}, Fallback error: {e2}")
+        self.sample_rate = 16000
+    def transcribe_to_characters(self, audio_path: str) -> Dict:
+        """
+        Transcribe audio directly to characters (no language model correction)
+        Returns raw character sequence as produced by the model
+        """
+        try:
+            # Load audio
+            speech, sr = librosa.load(audio_path, sr=self.sample_rate)
+            # Prepare input
+            input_values = self.processor(
+                speech,
+                sampling_rate=self.sample_rate,
+                return_tensors="pt"
+            ).input_values
+            # Get model predictions (no language model involved)
+            with torch.no_grad():
+                logits = self.model(input_values).logits
+                predicted_ids = torch.argmax(logits, dim=-1)
+            # Decode to characters directly
+            character_transcript = self.processor.batch_decode(predicted_ids)[0]
+            # Clean up character transcript
+            character_transcript = self._clean_character_transcript(character_transcript)
+            # Convert characters to phoneme-like representation
+            phoneme_like_transcript = self._characters_to_phoneme_representation(character_transcript)
+            return {
+                "character_transcript": character_transcript,
+                "phoneme_representation": phoneme_like_transcript,
+                "raw_predicted_ids": predicted_ids[0].tolist(),
+                "confidence_scores": torch.softmax(logits, dim=-1).max(dim=-1)[0][0].tolist()[:100]  # Limit for JSON
+            }
+        except Exception as e:
+            print(f"Transcription error: {e}")
+            return {
+                "character_transcript": "",
+                "phoneme_representation": "",
+                "raw_predicted_ids": [],
+                "confidence_scores": []
+            }
+    def _clean_character_transcript(self, transcript: str) -> str:
+        """Clean and standardize character transcript"""
+        # Remove extra spaces and special tokens
+        cleaned = re.sub(r'\s+', ' ', transcript)
+        cleaned = cleaned.strip().lower()
+        return cleaned
+    def _characters_to_phoneme_representation(self, text: str) -> str:
+        """Convert character-based transcript to phoneme-like representation for comparison"""
+        # This is a simple character-to-phoneme mapping for pronunciation comparison
+        # The idea is to convert the raw character output to something comparable with reference phonemes
+        if not text:
+            return ""
+        words = text.split()
+        phoneme_words = []
+        # Use our G2P to convert transcript words to phonemes
+        g2p = SimpleG2P()
+        for word in words:
+            try:
+                word_data = g2p.text_to_phonemes(word)[0]
+                phoneme_words.extend(word_data["phonemes"])
+            except:
+                # Fallback: simple letter-to-sound mapping
+                phoneme_words.extend(self._simple_letter_to_phoneme(word))
+        return " ".join(phoneme_words)
+    def _simple_letter_to_phoneme(self, word: str) -> List[str]:
+        """Simple fallback letter-to-phoneme conversion"""
+        letter_to_phoneme = {
+            'a': 'æ', 'b': 'b', 'c': 'k', 'd': 'd', 'e': 'ɛ',
+            'f': 'f', 'g': 'ɡ', 'h': 'h', 'i': 'ɪ', 'j': 'dʒ',
+            'k': 'k', 'l': 'l', 'm': 'm', 'n': 'n', 'o': 'ʌ',
+            'p': 'p', 'q': 'k', 'r': 'r', 's': 's', 't': 't',
+            'u': 'ʌ', 'v': 'v', 'w': 'w', 'x': 'ks', 'y': 'j', 'z': 'z'
+        }
+        phonemes = []
+        for letter in word.lower():
+            if letter in letter_to_phoneme:
+                phonemes.append(letter_to_phoneme[letter])
+        return phonemes
+# =============================================================================
+# SIMPLE G2P FOR REFERENCE
+# =============================================================================
class SimpleG2P:
    """Simple grapheme-to-phoneme converter for reference text.

    Words found in the CMU dictionary use their first listed pronunciation
    (stress digits removed); all other words are approximated with a
    letter/digraph lookup table. Phonemes are returned as IPA symbols.
    """

    # CMU symbol (stress digits already stripped) -> IPA symbol.
    _CMU_TO_IPA = {
        "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ",
        "AY": "aɪ", "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ",
        "IY": "i", "OW": "oʊ", "OY": "ɔɪ", "UH": "ʊ", "UW": "u",
        "B": "b", "CH": "tʃ", "D": "d", "DH": "ð", "F": "f",
        "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k", "L": "l",
        "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
        "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v",
        "W": "w", "Y": "j", "Z": "z", "ZH": "ʒ",
    }

    # Spelling fragment -> phonemes, for words missing from the dictionary.
    _SPELLING_TO_IPA = {
        "ch": ["tʃ"], "sh": ["ʃ"], "th": ["θ"], "ph": ["f"],
        "ck": ["k"], "ng": ["ŋ"], "qu": ["k", "w"],
        "a": ["æ"], "e": ["ɛ"], "i": ["ɪ"], "o": ["ʌ"], "u": ["ʌ"],
        "b": ["b"], "c": ["k"], "d": ["d"], "f": ["f"], "g": ["ɡ"],
        "h": ["h"], "j": ["dʒ"], "k": ["k"], "l": ["l"], "m": ["m"],
        "n": ["n"], "p": ["p"], "r": ["r"], "s": ["s"], "t": ["t"],
        "v": ["v"], "w": ["w"], "x": ["k", "s"], "y": ["j"], "z": ["z"],
    }

    # Successfully loaded CMU dictionary, shared by all instances so that
    # cmudict.dict() is called at most once per process after it succeeds.
    _shared_cmu_dict = None

    def __init__(self):
        """Attach the CMU dictionary, or an empty dict if it cannot be loaded."""
        self.cmu_dict = self._load_cmu_dict()

    @classmethod
    def _load_cmu_dict(cls) -> Dict:
        """Return the shared CMU dictionary, loading it on first use.

        A failed load is not cached, so a later instance will retry; the
        caller receives an empty dict and a warning is printed.
        """
        if cls._shared_cmu_dict is None:
            try:
                cls._shared_cmu_dict = cmudict.dict()
            except Exception:
                print("Warning: CMU dictionary not available")
                return {}
        return cls._shared_cmu_dict

    def text_to_phonemes(self, text: str) -> List[Dict]:
        """Convert text to one entry per word.

        Returns:
            A list of dicts with keys ``word``, ``phonemes`` (list of
            symbols), ``ipa`` and ``phoneme_string`` (symbols joined by
            spaces). The list is empty when the cleaned text has no words.
        """
        phoneme_sequence = []
        for word in self._clean_text(text).split():
            word_phonemes = self._get_word_phonemes(word)
            phoneme_sequence.append({
                "word": word,
                "phonemes": word_phonemes,
                "ipa": self._get_ipa(word),
                "phoneme_string": " ".join(word_phonemes)
            })
        return phoneme_sequence

    def get_reference_phoneme_string(self, text: str) -> str:
        """Return every phoneme of ``text`` as one space-separated string."""
        return " ".join(
            phoneme
            for word_data in self.text_to_phonemes(text)
            for phoneme in word_data["phonemes"]
        )

    def _clean_text(self, text: str) -> str:
        """Lower-case ``text``, keeping only word characters, spaces and apostrophes."""
        text = re.sub(r"[^\w\s\']", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.lower().strip()

    def _get_word_phonemes(self, word: str) -> List[str]:
        """Return phonemes for ``word`` from the dictionary or by estimation."""
        pronunciations = self.cmu_dict.get(word.lower())
        if not pronunciations:
            return self._estimate_phonemes(word)
        # Use the first listed pronunciation and drop the stress digits.
        stripped = [re.sub(r"[0-9]", "", p) for p in pronunciations[0]]
        return self._convert_to_wav2vec_format(stripped)

    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
        """Translate CMU symbols to IPA; unknown symbols are lower-cased."""
        return [self._CMU_TO_IPA.get(p, p.lower()) for p in cmu_phonemes]

    def _get_ipa(self, word: str) -> str:
        """Return the IPA transcription, or ``/word/`` if conversion fails."""
        try:
            return ipa.convert(word)
        except Exception:
            return f"/{word}/"

    def _estimate_phonemes(self, word: str) -> List[str]:
        """Estimate phonemes for a word that is not in the dictionary.

        Scans left to right, preferring a two-letter match over a
        single-letter one. Characters with no table entry are skipped.
        """
        table = self._SPELLING_TO_IPA
        word = word.lower()
        phonemes = []
        i = 0
        while i < len(word):
            pair = word[i:i + 2]
            if len(pair) == 2 and pair in table:
                phonemes.extend(table[pair])
                i += 2
                continue
            phonemes.extend(table.get(word[i], []))
            i += 1
        return phonemes
 # =============================================================================
+# PHONEME COMPARATOR
 # =============================================================================
class PhonemeComparator:
    """Score a learner's phoneme sequence against a reference sequence."""

    def __init__(self):
        # Substitutions commonly made by Vietnamese speakers, keyed by the
        # reference phoneme; these earn partial credit.
        self.substitution_patterns = {
            "θ": ["f", "s", "t"],    # TH → F, S, T
            "ð": ["d", "z", "v"],    # DH → D, Z, V
            "v": ["w", "f"],         # V → W, F
            "r": ["l"],              # R → L
            "l": ["r"],              # L → R
            "z": ["s"],              # Z → S
            "ʒ": ["ʃ", "z"],         # ZH → SH, Z
            "ŋ": ["n"],              # NG → N
        }
        # Per-phoneme difficulty for Vietnamese speakers (higher is harder).
        self.difficulty_map = {
            "θ": 0.9,  # th (think)
            "ð": 0.9,  # th (this)
            "v": 0.8,  # v
            "z": 0.8,  # z
            "ʒ": 0.9,  # zh (measure)
            "r": 0.7,  # r
            "l": 0.6,  # l
            "w": 0.5,  # w
            "f": 0.4,  # f
            "s": 0.3,  # s
            "ʃ": 0.5,  # sh
            "tʃ": 0.4, # ch
            "dʒ": 0.5, # j
            "ŋ": 0.3,  # ng
        }

    def compare_phoneme_sequences(self, reference_phonemes: str,
                                 learner_phonemes: str) -> List[Dict]:
        """Compare two space-separated phoneme strings position by position.

        Phonemes are paired by index; no re-alignment is attempted. Each
        returned dict has ``position``, ``reference_phoneme``,
        ``learner_phoneme``, ``status``, ``score`` and ``difficulty``.
        """
        expected_seq = reference_phonemes.split()
        spoken_seq = learner_phonemes.split()
        print(f"Reference phonemes: {expected_seq}")
        print(f"Learner phonemes: {spoken_seq}")

        results = []
        for position in range(max(len(expected_seq), len(spoken_seq))):
            # Past the end of a sequence the slot is represented by "".
            expected = expected_seq[position] if position < len(expected_seq) else ""
            spoken = spoken_seq[position] if position < len(spoken_seq) else ""

            verdict = self._judge_pair(expected, spoken)
            if verdict is None:
                continue
            status, score = verdict

            results.append({
                "position": position,
                "reference_phoneme": expected,
                "learner_phoneme": spoken,
                "status": status,
                "score": score,
                "difficulty": self.difficulty_map.get(expected, 0.3)
            })
        return results

    def _judge_pair(self, expected: str, spoken: str):
        """Return ``(status, score)`` for one slot, or None if both are empty."""
        if expected and spoken:
            if expected == spoken:
                return "correct", 1.0
            if self._is_acceptable_substitution(expected, spoken):
                return "acceptable", 0.7
            return "wrong", 0.2
        if expected:
            return "missing", 0.0
        if spoken:
            return "extra", 0.0
        return None

    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
        """Tell whether ``learner`` is a listed substitution for ``reference``."""
        return learner in self.substitution_patterns.get(reference, [])
+# =============================================================================
+# WORD ANALYZER
+# =============================================================================
class WordAnalyzer:
    """Analyze word-level pronunciation accuracy using character-based ASR."""

    # Words scoring below this value are reported as mispronounced.
    _WRONG_WORD_THRESHOLD = 0.6

    def __init__(self):
        self.g2p = SimpleG2P()
        self.comparator = PhonemeComparator()

    def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
        """Score the learner's phonemes against the reference text.

        Args:
            reference_text: Text the learner was asked to read.
            learner_phonemes: Space-separated phonemes derived from the ASR
                transcript.

        Returns:
            Dict with ``word_highlights``, ``phoneme_differences`` and
            ``wrong_words``.
        """
        reference_words = self.g2p.text_to_phonemes(reference_text)

        # Build the flat reference string from the per-word result instead of
        # running G2P over the text a second time.
        reference_phoneme_string = " ".join(
            phoneme
            for word_data in reference_words
            for phoneme in word_data["phonemes"]
        )

        phoneme_comparisons = self.comparator.compare_phoneme_sequences(
            reference_phoneme_string, learner_phonemes
        )
        word_highlights = self._create_word_highlights(reference_words, phoneme_comparisons)
        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)

        return {
            "word_highlights": word_highlights,
            "phoneme_differences": phoneme_comparisons,
            "wrong_words": wrong_words
        }

    def _create_word_highlights(self, reference_words: List[Dict],
                              phoneme_comparisons: List[Dict]) -> List[Dict]:
        """Attach a score, status and colour to every reference word.

        Comparisons are consumed in order: the first word owns the first
        ``len(phonemes)`` comparisons, the next word the following ones, and
        so on. A word with no comparisons available scores 0.0.
        """
        word_highlights = []
        phoneme_index = 0

        for word_data in reference_words:
            word_phonemes = word_data["phonemes"]
            num_phonemes = len(word_phonemes)

            # Slicing clamps at the end of the list, so a word that runs past
            # the available comparisons simply gets fewer scores.
            window = phoneme_comparisons[phoneme_index:phoneme_index + num_phonemes]
            word_phoneme_scores = [comparison["score"] for comparison in window]
            word_score = float(np.mean(word_phoneme_scores)) if word_phoneme_scores else 0.0

            word_highlights.append({
                "word": word_data["word"],
                "score": word_score,
                "status": self._get_word_status(word_score),
                "color": self._get_word_color(word_score),
                "phonemes": word_phonemes,
                "ipa": word_data["ipa"],
                "phoneme_scores": word_phoneme_scores,
                "phoneme_start_index": phoneme_index,
                "phoneme_end_index": phoneme_index + num_phonemes - 1
            })
            phoneme_index += num_phonemes

        return word_highlights

    def _identify_wrong_words(self, word_highlights: List[Dict],
                            phoneme_comparisons: List[Dict]) -> List[Dict]:
        """Return details for every word scoring below the wrong-word threshold.

        Only comparisons with status ``wrong`` or ``missing`` are itemised.
        """
        wrong_words = []

        for word_highlight in word_highlights:
            if word_highlight["score"] >= self._WRONG_WORD_THRESHOLD:
                continue

            start_idx = word_highlight["phoneme_start_index"]
            end_idx = word_highlight["phoneme_end_index"]
            wrong_phonemes = []
            missing_phonemes = []

            for comparison in phoneme_comparisons[start_idx:end_idx + 1]:
                if comparison["status"] == "wrong":
                    wrong_phonemes.append({
                        "expected": comparison["reference_phoneme"],
                        "actual": comparison["learner_phoneme"],
                        "difficulty": comparison["difficulty"]
                    })
                elif comparison["status"] == "missing":
                    missing_phonemes.append({
                        "phoneme": comparison["reference_phoneme"],
                        "difficulty": comparison["difficulty"]
                    })

            wrong_words.append({
                "word": word_highlight["word"],
                "score": word_highlight["score"],
                "expected_phonemes": word_highlight["phonemes"],
                "ipa": word_highlight["ipa"],
                "wrong_phonemes": wrong_phonemes,
                "missing_phonemes": missing_phonemes,
                "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes)
            })

        return wrong_words

    def _get_word_status(self, score: float) -> str:
        """Get word status from score, using the same bands as the colours."""
        if score >= 0.8:
            return "excellent"
        elif score >= 0.6:
            return "good"
        elif score >= 0.4:
            return "needs_practice"
        else:
            return "poor"

    def _get_word_color(self, score: float) -> str:
        """Get the highlight colour (hex string) for a word score."""
        if score >= 0.8:
            return "#22c55e"  # Green
        elif score >= 0.6:
            return "#84cc16"  # Light green
        elif score >= 0.4:
            return "#eab308"  # Yellow
        else:
            return "#ef4444"  # Red

    def _get_vietnamese_tips(self, wrong_phonemes: List[Dict],
                           missing_phonemes: List[Dict]) -> List[str]:
        """Build Vietnamese pronunciation tips for the given phoneme errors.

        Wrong phonemes always yield a tip (a generic one when no specific tip
        exists); missing phonemes yield a tip only when a specific one exists.
        Identical tips are emitted once, in first-seen order.
        """
        vietnamese_tips = {
            "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
            "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
            "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
            "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
            "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
            "z": "Giống âm 's' nhưng có rung dây thanh âm",
            "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
            "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'"
        }

        tips = []

        def add_tip(tip: str) -> None:
            # A word can contain the same error twice; show each tip once.
            if tip not in tips:
                tips.append(tip)

        for wrong in wrong_phonemes:
            expected = wrong["expected"]
            actual = wrong["actual"]
            if expected in vietnamese_tips:
                add_tip(f"Âm '{expected}': {vietnamese_tips[expected]}")
            else:
                add_tip(f"Luyện âm '{expected}' thay vì '{actual}'")

        for missing in missing_phonemes:
            phoneme = missing["phoneme"]
            if phoneme in vietnamese_tips:
                add_tip(f"Thiếu âm '{phoneme}': {vietnamese_tips[phoneme]}")

        return tips
+# =============================================================================
+# FEEDBACK GENERATOR
+# =============================================================================
class SimpleFeedbackGenerator:
    """Produce short, actionable feedback messages in Vietnamese."""

    def generate_feedback(self, overall_score: float, wrong_words: List[Dict],
                         phoneme_comparisons: List[Dict]) -> List[str]:
        """Build the feedback list for one assessment.

        The list always starts with an overall message chosen by score band.
        It is followed, when applicable, by a note about the mispronounced
        words and a tip for the reference phoneme that was wrong or missing
        most often (only if a tip exists for that phoneme).
        """
        # 1) Overall message, picked by score band.
        if overall_score >= 0.8:
            summary = "Phát âm rất tốt! Bạn đã làm xuất sắc."
        elif overall_score >= 0.6:
            summary = "Phát âm khá tốt, còn một vài điểm cần cải thiện."
        elif overall_score >= 0.4:
            summary = "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
        else:
            summary = "Hãy luyện tập chậm và rõ ràng hơn."
        messages = [summary]

        # 2) Mispronounced words: list up to three by name, otherwise a count.
        if wrong_words and len(wrong_words) <= 3:
            joined = ', '.join(entry["word"] for entry in wrong_words)
            messages.append(f"Các từ cần luyện tập: {joined}")
        elif wrong_words:
            messages.append(f"Có {len(wrong_words)} từ cần luyện tập. Tập trung vào từng từ một.")

        # 3) Count wrong/missing occurrences per reference phoneme.
        error_counts = {}
        for item in phoneme_comparisons:
            if item["status"] in ("wrong", "missing"):
                symbol = item["reference_phoneme"]
                error_counts[symbol] = error_counts.get(symbol, 0) + 1

        if not error_counts:
            return messages

        # On a tie the phoneme that was seen first wins.
        hardest = max(error_counts, key=error_counts.get)
        short_tips = {
            "θ": "Lưỡi giữa răng, thổi nhẹ",
            "ð": "Lưỡi giữa răng, rung dây thanh",
            "v": "Môi dưới chạm răng trên",
            "r": "Cuộn lưỡi, không chạm vòm miệng",
            "l": "Lưỡi chạm vòm miệng",
            "z": "Như 's' nhưng rung dây thanh"
        }
        if hardest in short_tips:
            messages.append(f"Âm khó nhất '{hardest}': {short_tips[hardest]}")
        return messages
 # =============================================================================
+# MAIN PRONUNCIATION ASSESSOR
 # =============================================================================
class SimplePronunciationAssessor:
    """Top-level assessor: ASR transcript, word analysis, score and feedback."""

    def __init__(self):
        print("Initializing Simple Pronunciation Assessor...")
        self.asr = Wav2Vec2CharacterASR()  # character-level ASR wrapper
        self.word_analyzer = WordAnalyzer()
        self.feedback_generator = SimpleFeedbackGenerator()
        print("Initialization completed")

    def assess_pronunciation(self, audio_path: str, reference_text: str) -> Dict:
        """Assess how closely the recording matches ``reference_text``.

        Args:
            audio_path: Path of the audio file to transcribe.
            reference_text: Text the speaker was asked to read.

        Returns:
            Dict containing the transcript, its phoneme representation, the
            overall score, word highlights, phoneme differences, wrong words,
            feedback messages and a ``processing_info`` sub-dict.
        """
        print("Starting pronunciation assessment...")

        # Transcribe the audio to characters and a phoneme-like string.
        print("Step 1: Transcribing to characters...")
        transcription = self.asr.transcribe_to_characters(audio_path)
        spoken_text = transcription["character_transcript"]
        spoken_phonemes = transcription["phoneme_representation"]
        print(f"Character transcript: {spoken_text}")
        print(f"Phoneme representation: {spoken_phonemes}")

        # Compare against the reference, word by word.
        print("Step 2: Analyzing words...")
        analysis = self.word_analyzer.analyze_words(reference_text, spoken_phonemes)
        differences = analysis["phoneme_differences"]
        wrong_words = analysis["wrong_words"]

        score = self._calculate_overall_score(differences)

        print("Step 3: Generating feedback...")
        feedback = self.feedback_generator.generate_feedback(
            score, wrong_words, differences
        )

        processing_info = {
            "model_used": f"Wav2Vec2-Character ({self.asr.model_name})",
            "character_based": True,
            "language_model_correction": False,
            "raw_output": True
        }
        result = {
            "transcript": spoken_text,
            "transcript_phonemes": spoken_phonemes,
            "user_phonemes": spoken_phonemes,  # same value under a second key
            "character_transcript": spoken_text,
            "overall_score": score,
            "word_highlights": analysis["word_highlights"],
            "phoneme_differences": differences,
            "wrong_words": wrong_words,
            "feedback": feedback,
            "processing_info": processing_info
        }
        print("Assessment completed successfully")
        return result

    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
        """Return the mean of the comparison scores, or 0.0 when there are none."""
        if len(phoneme_comparisons) == 0:
            return 0.0
        scores = [entry["score"] for entry in phoneme_comparisons]
        return sum(scores) / len(phoneme_comparisons)
 # =============================================================================
+# API ENDPOINT
 # =============================================================================
+# Module-level assessor instance, created when this module is imported.
+# SimplePronunciationAssessor.__init__ builds the ASR wrapper, the word
+# analyzer and the feedback generator; the route handlers below reuse
+# this single instance.
+assessor = SimplePronunciationAssessor()
def convert_numpy_types(obj):
    """Recursively replace NumPy values with native Python equivalents.

    NumPy booleans, integers and floats become ``bool``/``int``/``float``,
    arrays become (nested) lists, and dicts, lists and tuples are converted
    element by element (dict keys are left untouched). Any other object is
    returned unchanged.
    """
    if isinstance(obj, np.bool_):
        # np.bool_ is neither np.integer nor np.floating, so it needs its own
        # branch to end up as a plain bool.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    return obj
@router.post("/assess", response_model=PronunciationAssessmentResult)
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
    reference_text: str = Form(..., description="Reference text to pronounce")
):
    """
    Pronunciation Assessment API using Wav2Vec2 Character-level Model
    Key Features:
    - Uses facebook/wav2vec2-large-960h-lv60-self for character transcription
    - NO language model correction (shows actual pronunciation errors)
    - Character-level accuracy converted to phoneme representation
    - Vietnamese-optimized feedback and tips
    Input: Audio file + Reference text
    Output: Word highlights + Phoneme differences + Wrong words
    """
    import time
    import traceback

    start_time = time.time()

    # Validate inputs (rejected with 400 before any file is written).
    if not reference_text.strip():
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")
    if len(reference_text) > 500:
        raise HTTPException(status_code=400, detail="Reference text too long (max 500 characters)")
    # Check for valid English characters
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        raise HTTPException(
            status_code=400,
            detail="Text must contain only English letters, spaces, and basic punctuation"
        )

    # The filename comes from the client: only accept a short alphanumeric
    # extension so the temp-file suffix can never contain path separators.
    file_extension = ".wav"
    if audio.filename:
        candidate = os.path.splitext(os.path.basename(audio.filename))[1]
        if re.fullmatch(r"\.[A-Za-z0-9]{1,8}", candidate):
            file_extension = candidate

    tmp_path = None
    try:
        content = await audio.read()

        # Write and close the file before processing so it is fully flushed
        # and can be reopened by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
            tmp_file.write(content)
            tmp_path = tmp_file.name
        print(f"Processing audio file: {tmp_path}")

        # Run assessment using Wav2Vec2 Character model
        result = assessor.assess_pronunciation(tmp_path, reference_text)

        processing_time = time.time() - start_time
        result["processing_info"]["processing_time"] = processing_time

        # Convert numpy types for JSON serialization
        final_result = convert_numpy_types(result)
        print(f"Assessment completed in {processing_time:.2f} seconds")
        return PronunciationAssessmentResult(**final_result)
    except Exception as e:
        print(f"Assessment error: {str(e)}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}") from e
    finally:
        # delete=False leaves the file on disk, so remove it ourselves.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; a failed delete must not mask the result.
                pass
+# =============================================================================
+# UTILITY ENDPOINTS
+# =============================================================================
 @router.get("/phonemes/{word}")
 async def get_word_phonemes(word: str):
+    """Get phoneme breakdown for a specific word"""
     try:
+        g2p = SimpleG2P()
+        phoneme_data = g2p.text_to_phonemes(word)[0]
+        # Add difficulty analysis for Vietnamese speakers
+        difficulty_scores = []
+        comparator = PhonemeComparator()
+        for phoneme in phoneme_data["phonemes"]:
+            difficulty = comparator.difficulty_map.get(phoneme, 0.3)
+            difficulty_scores.append(difficulty)
+        avg_difficulty = float(np.mean(difficulty_scores)) if difficulty_scores else 0.3
         return {
             "word": word,
+            "phonemes": phoneme_data["phonemes"],
+            "phoneme_string": phoneme_data["phoneme_string"],
+            "ipa": phoneme_data["ipa"],
+            "difficulty_score": avg_difficulty,
+            "difficulty_level": "hard" if avg_difficulty > 0.6 else "medium" if avg_difficulty > 0.4 else "easy",
+            "challenging_phonemes": [
+                {
+                    "phoneme": p,
+                    "difficulty": comparator.difficulty_map.get(p, 0.3),
+                    "vietnamese_tip": get_vietnamese_tip(p)
+                }
+                for p in phoneme_data["phonemes"]
+                if comparator.difficulty_map.get(p, 0.3) > 0.6
+            ]
         }
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Word analysis error: {str(e)}")
@router.get("/health")
async def health_check():
    """Health check endpoint"""
    # Reading the model name is the only step that can fail; if it does, the
    # error text is reported instead of the healthy payload.
    try:
        model_name = assessor.asr.model_name
    except Exception as e:
        return {
            "status": "error",
            "error": str(e)
        }
    return {
        "status": "healthy",
        "model": model_name,
        "character_based": True,
        "language_model_correction": False,
        "vietnamese_optimized": True
    }
@router.get("/test-model")
async def test_model():
    """Test if Wav2Vec2 model is working"""
    # Reports the assessor's model name and sample rate with fixed sample
    # strings; no inference is run here.
    try:
        asr = assessor.asr
        model_name = asr.model_name
        sample_rate = asr.sample_rate
    except Exception as e:
        return {
            "model_loaded": False,
            "error": str(e)
        }
    return {
        "model_loaded": True,
        "model_name": model_name,
        "processor_ready": True,
        "sample_rate": sample_rate,
        "sample_characters": "this is a test",
        "sample_phonemes": "ðɪs ɪz ə tɛst"
    }
 # =============================================================================
 # HELPER FUNCTIONS
 # =============================================================================
def get_vietnamese_tip(phoneme: str) -> str:
    """Return a short Vietnamese articulation tip for ``phoneme``.

    Phonemes without a dedicated tip get a generic "practise this sound"
    message that embeds the phoneme itself.
    """
    known_tips = {
        "θ": "Đặt lưỡi giữa răng, thổi nhẹ",
        "ð": "Giống θ nhưng rung dây thanh âm",
        "v": "Môi dưới chạm răng trên",
        "r": "Cuộn lưỡi, không chạm vòm miệng",
        "l": "Lưỡi chạm vòm miệng sau răng",
        "z": "Như 's' nhưng rung dây thanh",
        "ʒ": "Như 'ʃ' nhưng rung dây thanh",
        "w": "Tròn môi như 'u'"
    }
    if phoneme in known_tips:
        return known_tips[phoneme]
    return f"Luyện âm {phoneme}"