Spaces:

ABAO77
/

Run_code_api

Sleeping

App Files Community

ABAO77 committed on Sep 1

Commit

dd47219

1 Parent(s): 9c76eb3

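Rewrite speaking_route.py: replace the Whisper-based simplified pronunciation assessment with an enhanced multi-word pronunciation API (CMU Dict + phoneme libraries) and move the router prefix from /pronunciation to /speaking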

Browse files

Files changed (1) hide show

src/apis/routes/speaking_route.py +1381 -467

src/apis/routes/speaking_route.py CHANGED Viewed

@@ -1,16 +1,19 @@
-# SIMPLIFIED PRONUNCIATION ASSESSMENT API
-# Input: Audio + Reference Text → Output: Word highlights + Phoneme diff + Wrong words
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from typing import List, Dict, Optional
 import tempfile
 import os
 import numpy as np
 import nltk
 import eng_to_ipa as ipa
-import whisper
 import re
 from collections import defaultdict
 import warnings
@@ -20,6 +23,7 @@ warnings.filterwarnings("ignore")
 # Download required NLTK data
 try:
     nltk.download("cmudict", quiet=True)
     from nltk.corpus import cmudict
 except:
     print("Warning: NLTK data not available")
@@ -27,74 +31,147 @@ except:
 # =============================================================================
 # MODELS
 # =============================================================================
-router = APIRouter(prefix="/pronunciation", tags=["Pronunciation"])
-class PronunciationAssessmentResult(BaseModel):
-    transcript: str
     overall_score: float
-    word_highlights: List[Dict]
-    phoneme_differences: List[Dict]
-    wrong_words: List[Dict]
     feedback: List[str]
 # =============================================================================
-# CORE COMPONENTS
 # =============================================================================
-class SimpleG2P:
-    """Simple Grapheme-to-Phoneme converter"""
     def __init__(self):
         try:
             self.cmu_dict = cmudict.dict()
         except:
             self.cmu_dict = {}
             print("Warning: CMU dictionary not available")
-    def text_to_phonemes(self, text: str) -> List[Dict]:
-        """Convert text to phoneme sequence"""
-        words = self._clean_text(text).split()
-        phoneme_sequence = []
-        for word in words:
-            word_phonemes = self._get_word_phonemes(word)
-            phoneme_sequence.append(
-                {"word": word, "phonemes": word_phonemes, "ipa": self._get_ipa(word)}
-            )
-        return phoneme_sequence
-    def _clean_text(self, text: str) -> str:
-        """Clean text for processing"""
-        text = re.sub(r"[^\w\s\']", " ", text)
-        text = re.sub(r"\s+", " ", text)
-        return text.lower().strip()
-    def _get_word_phonemes(self, word: str) -> List[str]:
-        """Get phonemes for a word"""
-        word_lower = word.lower()
         if word_lower in self.cmu_dict:
-            # Remove stress markers
-            phonemes = self.cmu_dict[word_lower][0]
-            return [re.sub(r"[0-9]", "", p) for p in phonemes]
-        else:
-            # Simple fallback
-            return self._estimate_phonemes(word)
-    def _get_ipa(self, word: str) -> str:
-        """Get IPA transcription"""
         try:
-            return ipa.convert(word)
         except:
-            return f"/{word}/"
     def _estimate_phonemes(self, word: str) -> List[str]:
         """Estimate phonemes for unknown words"""
         phoneme_map = {
             "ch": ["CH"],
             "sh": ["SH"],
@@ -136,7 +213,7 @@ class SimpleG2P:
         while i < len(word):
             # Check 2-letter combinations first
-            if i <= len(word) - 2:
                 two_char = word[i : i + 2]
                 if two_char in phoneme_map:
                     phonemes.extend(phoneme_map[two_char])
@@ -152,343 +229,921 @@ class SimpleG2P:
         return phonemes
-class SimplePhonemeComparator:
-    """Simple phoneme comparison"""
-    def __init__(self):
-        # Vietnamese difficulty map
-        self.difficulty_map = {
-            "TH": 0.9,
-            "DH": 0.9,
-            "V": 0.8,
-            "Z": 0.8,
-            "ZH": 0.9,
-            "R": 0.7,
-            "L": 0.6,
-            "W": 0.5,
-            "F": 0.4,
-            "S": 0.3,
-            "SH": 0.5,
-            "CH": 0.4,
-            "JH": 0.5,
-            "NG": 0.3,
         }
-        # Common substitution patterns for Vietnamese speakers
-        self.substitution_patterns = {
-            "TH": ["F", "S", "T"],
-            "DH": ["D", "Z", "V"],
-            "V": ["W", "F"],
-            "R": ["L"],
-            "L": ["R"],
-            "Z": ["S"],
         }
-    def compare_phonemes(
-        self, reference_phonemes: List[Dict], learner_phonemes: List[Dict]
-    ) -> List[Dict]:
-        """Compare reference and learner phoneme sequences"""
-        # Flatten phoneme sequences
-        ref_sequence = []
-        learner_sequence = []
-        for word_data in reference_phonemes:
-            for phoneme in word_data["phonemes"]:
-                ref_sequence.append({"phoneme": phoneme, "word": word_data["word"]})
-        for word_data in learner_phonemes:
-            for phoneme in word_data["phonemes"]:
-                learner_sequence.append({"phoneme": phoneme, "word": word_data["word"]})
-        # Simple alignment and comparison
-        comparisons = []
-        max_len = max(len(ref_sequence), len(learner_sequence))
-        for i in range(max_len):
-            ref_item = ref_sequence[i] if i < len(ref_sequence) else None
-            learner_item = learner_sequence[i] if i < len(learner_sequence) else None
-            if ref_item and learner_item:
-                ref_phoneme = ref_item["phoneme"]
-                learner_phoneme = learner_item["phoneme"]
-                if ref_phoneme == learner_phoneme:
-                    status = "correct"
-                    score = 1.0
-                elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
-                    status = "acceptable"
-                    score = 0.7
-                else:
-                    status = "wrong"
-                    score = 0.3
-                comparisons.append(
-                    {
-                        "position": i,
-                        "reference_phoneme": ref_phoneme,
-                        "learner_phoneme": learner_phoneme,
-                        "status": status,
-                        "score": score,
-                        "word": ref_item["word"],
-                        "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
-                    }
-                )
-            elif ref_item and not learner_item:
-                # Missing phoneme
-                comparisons.append(
-                    {
-                        "position": i,
-                        "reference_phoneme": ref_item["phoneme"],
-                        "learner_phoneme": "",
-                        "status": "missing",
-                        "score": 0.0,
-                        "word": ref_item["word"],
-                        "difficulty": self.difficulty_map.get(ref_item["phoneme"], 0.3),
-                    }
-                )
-            elif learner_item and not ref_item:
-                # Extra phoneme
-                comparisons.append(
-                    {
-                        "position": i,
-                        "reference_phoneme": "",
-                        "learner_phoneme": learner_item["phoneme"],
-                        "status": "extra",
-                        "score": 0.0,
-                        "word": learner_item["word"],
-                        "difficulty": 0.3,
-                    }
-                )
-        return comparisons
-    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
-        """Check if substitution is acceptable for Vietnamese speakers"""
-        acceptable = self.substitution_patterns.get(reference, [])
-        return learner in acceptable
-class SimplePronunciationAssessor:
-    """Simplified pronunciation assessor focused on core functionality"""
     def __init__(self):
-        print("Initializing Whisper model...")
-        self.whisper_model = whisper.load_model("base.en", in_memory=True)
-        print("Whisper model loaded successfully")
-        self.g2p = SimpleG2P()
-        self.comparator = SimplePhonemeComparator()
         self.sample_rate = 16000
-    def assess_pronunciation(self, audio_path: str, reference_text: str) -> Dict:
-        """Main assessment function"""
-        # Step 1: Whisper ASR
-        print("Running Whisper transcription...")
-        asr_result = self.whisper_model.transcribe(audio_path)
-        transcript = asr_result["text"].strip()
-        print(f"Transcript: '{transcript}'")
-        # Step 2: Get reference phonemes
-        print("Getting reference phonemes...")
-        reference_phonemes = self.g2p.text_to_phonemes(reference_text)
-        # Step 3: Get learner phonemes from transcript
-        print("Getting learner phonemes...")
-        learner_phonemes = self.g2p.text_to_phonemes(transcript)
-        # Step 4: Compare phonemes
-        print("Comparing phonemes...")
-        phoneme_comparisons = self.comparator.compare_phonemes(
-            reference_phonemes, learner_phonemes
         )
-        # Step 5: Generate word highlights
-        print("Generating word highlights...")
-        word_highlights = self._generate_word_highlights(
-            reference_phonemes, learner_phonemes, phoneme_comparisons
         )
-        # Step 6: Identify wrong words
-        print("Identifying wrong words...")
-        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
-        # Step 7: Calculate overall score
-        overall_score = self._calculate_overall_score(phoneme_comparisons)
-        # Step 8: Generate feedback
-        feedback = self._generate_simple_feedback(
-            overall_score, wrong_words, phoneme_comparisons
         )
         return {
-            "transcript": transcript,
             "overall_score": overall_score,
-            "word_highlights": word_highlights,
-            "phoneme_differences": phoneme_comparisons,
-            "wrong_words": wrong_words,
             "feedback": feedback,
         }
-    def _generate_word_highlights(
         self,
-        reference_phonemes: List[Dict],
-        learner_phonemes: List[Dict],
-        phoneme_comparisons: List[Dict],
-    ) -> List[Dict]:
-        """Generate word highlighting data"""
-        word_highlights = []
-        # Group comparisons by word
-        word_scores = defaultdict(list)
-        for comparison in phoneme_comparisons:
-            word = comparison.get("word", "unknown")
-            if comparison["status"] in ["correct", "acceptable", "wrong"]:
-                word_scores[word].append(comparison["score"])
-        # Create highlights for reference words
-        for word_data in reference_phonemes:
-            word = word_data["word"]
-            scores = word_scores.get(word, [0.0])
-            avg_score = float(np.mean(scores))
-            highlight = {
-                "word": word,
-                "score": avg_score,
-                "status": self._get_word_status(avg_score),
-                "color": self._get_word_color(avg_score),
-                "phonemes": word_data["phonemes"],
-                "ipa": word_data["ipa"],
-                "issues": self._get_word_issues(word, phoneme_comparisons),
             }
-            word_highlights.append(highlight)
-        return word_highlights
-    def _identify_wrong_words(
-        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[Dict]:
-        """Identify words that were pronounced incorrectly"""
-        wrong_words = []
-        for word_highlight in word_highlights:
-            if word_highlight["score"] < 0.6:  # Threshold for "wrong"
-                word = word_highlight["word"]
-                # Find specific issues for this word
-                word_issues = []
-                wrong_phonemes = []
-                missing_phonemes = []
-                for comparison in phoneme_comparisons:
-                    if comparison.get("word") == word:
-                        if comparison["status"] == "wrong":
-                            wrong_phonemes.append(
-                                {
-                                    "expected": comparison["reference_phoneme"],
-                                    "actual": comparison["learner_phoneme"],
-                                }
-                            )
-                        elif comparison["status"] == "missing":
-                            missing_phonemes.append(comparison["reference_phoneme"])
-                if wrong_phonemes:
-                    word_issues.append(
-                        f"Wrong sounds: {', '.join([p['expected'] for p in wrong_phonemes])}"
-                    )
-                if missing_phonemes:
-                    word_issues.append(f"Missing sounds: {', '.join(missing_phonemes)}")
-                wrong_word = {
-                    "word": word,
-                    "score": word_highlight["score"],
-                    "expected_phonemes": word_highlight["phonemes"],
-                    "ipa": word_highlight["ipa"],
-                    "issues": word_issues,
-                    "wrong_phonemes": wrong_phonemes,
-                    "missing_phonemes": missing_phonemes,
-                    "tips": self._get_pronunciation_tips(
-                        word, wrong_phonemes, missing_phonemes
                     ),
                 }
-                wrong_words.append(wrong_word)
-        return wrong_words
-    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
-        """Calculate overall pronunciation score"""
-        if not phoneme_comparisons:
-            return 0.0
-        total_score = 0.0
-        for comparison in phoneme_comparisons:
-            total_score += comparison["score"]
-        return total_score / len(phoneme_comparisons)
-    def _generate_simple_feedback(
         self,
         overall_score: float,
-        wrong_words: List[Dict],
-        phoneme_comparisons: List[Dict],
     ) -> List[str]:
-        """Generate simple, actionable feedback"""
         feedback = []
-        # Overall feedback
-        if overall_score >= 0.8:
-            feedback.append("Phát âm tốt! Bạn đã làm rất tốt.")
-        elif overall_score >= 0.6:
-            feedback.append("Phát âm khá tốt, còn một vài điểm cần cải thiện.")
-        elif overall_score >= 0.4:
             feedback.append(
-                "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
             )
         else:
-            feedback.append("Hãy luyện tập chậm và rõ ràng hơn.")
-        # Wrong words feedback
-        if wrong_words:
-            word_names = [w["word"] for w in wrong_words[:3]]
-            feedback.append(f"Các từ cần luyện tập: {', '.join(word_names)}")
         # Phoneme-specific feedback for Vietnamese speakers
-        problem_phonemes = defaultdict(int)
-        for comparison in phoneme_comparisons:
-            if comparison["status"] == "wrong":
-                phoneme = comparison["reference_phoneme"]
-                problem_phonemes[phoneme] += 1
-        # Vietnamese-specific tips for most problematic sounds
-        vietnamese_tips = {
-            "TH": "Đặt lưỡi giữa răng, thổi nhẹ",
-            "DH": "Giống TH nhưng rung dây thanh",
-            "V": "Chạm môi dưới vào răng trên",
-            "R": "Cuộn lưỡi, không chạm vòm miệng",
-            "L": "Đầu lưỡi chạm vòm miệng",
-            "Z": "Giống S nhưng có rung dây thanh",
-        }
-        if problem_phonemes:
-            most_difficult = sorted(
-                problem_phonemes.items(), key=lambda x: x[1], reverse=True
             )
-            for phoneme, count in most_difficult[:2]:
-                if phoneme in vietnamese_tips:
-                    feedback.append(f"Âm {phoneme}: {vietnamese_tips[phoneme]}")
         return feedback
     def _get_word_status(self, score: float) -> str:
         """Get word status from score"""
         if score >= 0.8:
@@ -500,215 +1155,474 @@ class SimplePronunciationAssessor:
         else:
             return "poor"
-    def _get_word_color(self, score: float) -> str:
-        """Get color for word highlighting"""
-        if score >= 0.8:
-            return "#22c55e"  # Green
-        elif score >= 0.6:
-            return "#84cc16"  # Light green
-        elif score >= 0.4:
-            return "#eab308"  # Yellow
-        else:
-            return "#ef4444"  # Red
-    def _get_word_issues(self, word: str, phoneme_comparisons: List[Dict]) -> List[str]:
-        """Get specific issues for a word"""
-        issues = []
-        word_comparisons = [c for c in phoneme_comparisons if c.get("word") == word]
-        wrong_count = len([c for c in word_comparisons if c["status"] == "wrong"])
-        missing_count = len([c for c in word_comparisons if c["status"] == "missing"])
-        if wrong_count > 0:
-            issues.append(f"{wrong_count} sai âm")
-        if missing_count > 0:
-            issues.append(f"{missing_count} thiếu âm")
-        return issues
-    def _get_pronunciation_tips(
-        self, word: str, wrong_phonemes: List[Dict], missing_phonemes: List[str]
-    ) -> List[str]:
-        """Get pronunciation tips for wrong words"""
-        tips = []
-        # Tips for specific problematic phonemes
-        phoneme_tips = {
-            "TH": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ",
-            "DH": "Giống TH nhưng rung dây thanh âm",
-            "V": "Chạm môi dưới vào răng trên, không dùng cả hai môi",
-            "R": "Cuộn lưỡi nhưng không chạm vào vòm miệng",
-            "L": "Đầu lưỡi chạm vào vòm miệng sau răng",
-            "Z": "Giống âm S nhưng có rung dây thanh âm",
-        }
-        # Add tips for wrong phonemes
-        for wrong in wrong_phonemes:
-            expected = wrong["expected"]
-            if expected in phoneme_tips:
-                tips.append(f"Âm {expected}: {phoneme_tips[expected]}")
-        # Add tips for missing phonemes
-        for missing in missing_phonemes:
-            if missing in phoneme_tips:
-                tips.append(f"Thiếu âm {missing}: {phoneme_tips[missing]}")
-        # General tip if no specific tips
-        if not tips:
-            tips.append(f"Luyện tập từ '{word}' chậm và rõ ràng")
-        return tips
 # =============================================================================
-# MAIN API ENDPOINT
 # =============================================================================
-# Initialize assessor
-assessor = SimplePronunciationAssessor()
-def convert_numpy_types(obj):
-    """Convert numpy types to Python native types"""
-    if isinstance(obj, np.integer):
-        return int(obj)
-    elif isinstance(obj, np.floating):
-        return float(obj)
-    elif isinstance(obj, np.ndarray):
-        return obj.tolist()
-    elif isinstance(obj, dict):
-        return {key: convert_numpy_types(value) for key, value in obj.items()}
-    elif isinstance(obj, list):
-        return [convert_numpy_types(item) for item in obj]
-    else:
-        return obj
-@router.post("/assess", response_model=PronunciationAssessmentResult)
 async def assess_pronunciation(
-    audio: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
-    reference_text: str = Form(..., description="Reference text to compare against"),
 ):
     """
-    Main API: Pronunciation Assessment
-    Input: Audio file + Reference text
-    Output: Word highlights + Phoneme differences + Wrong words
-    Features:
-    - Whisper ASR for transcript
-    - CMU Dict phoneme mapping
-    - Vietnamese-optimized comparison
-    - Simple UI-ready output
     """
     import time
     start_time = time.time()
     # Validate inputs
     if not reference_text.strip():
         raise HTTPException(status_code=400, detail="Reference text cannot be empty")
-    if len(reference_text) > 500:
         raise HTTPException(
-            status_code=400, detail="Reference text too long (max 500 characters)"
         )
-    # Check for valid English characters
     if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
         raise HTTPException(
             status_code=400,
-            detail="Text must contain only English letters, spaces, and basic punctuation",
         )
     try:
-        # Save uploaded file temporarily
         file_extension = ".wav"
-        if audio.filename and "." in audio.filename:
-            file_extension = f".{audio.filename.split('.')[-1]}"
         with tempfile.NamedTemporaryFile(
             delete=False, suffix=file_extension
         ) as tmp_file:
             content = await audio.read()
             tmp_file.write(content)
             tmp_file.flush()
-            print(f"Processing audio file: {tmp_file.name}")
-            # Run assessment
-            result = assessor.assess_pronunciation(tmp_file.name, reference_text)
-            # Clean up temporary file
             os.unlink(tmp_file.name)
-        # Convert numpy types for JSON serialization
-        final_result = convert_numpy_types(result)
         processing_time = time.time() - start_time
-        print(f"Assessment completed in {processing_time:.2f} seconds")
-        return PronunciationAssessmentResult(**final_result)
     except Exception as e:
-        print(f"Assessment error: {str(e)}")
         import traceback
         traceback.print_exc()
-        raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}")
-# =============================================================================
-# UTILITY ENDPOINTS
-# =============================================================================
 @router.get("/phonemes/{word}")
 async def get_word_phonemes(word: str):
-    """Get phoneme breakdown for a specific word"""
     try:
-        phoneme_data = assessor.g2p.text_to_phonemes(word)[0]
-        # Add difficulty analysis
-        difficulty_scores = []
-        for phoneme in phoneme_data["phonemes"]:
-            difficulty = assessor.comparator.difficulty_map.get(phoneme, 0.3)
-            difficulty_scores.append(difficulty)
-        avg_difficulty = float(np.mean(difficulty_scores)) if difficulty_scores else 0.3
         return {
             "word": word,
-            "phonemes": phoneme_data["phonemes"],
-            "ipa": phoneme_data["ipa"],
-            "difficulty_score": avg_difficulty,
             "difficulty_level": (
                 "hard"
-                if avg_difficulty > 0.6
-                else "medium" if avg_difficulty > 0.4 else "easy"
             ),
-            "challenging_phonemes": [
-                {
-                    "phoneme": p,
-                    "difficulty": assessor.comparator.difficulty_map.get(p, 0.3),
-                }
-                for p in phoneme_data["phonemes"]
-                if assessor.comparator.difficulty_map.get(p, 0.3) > 0.6
-            ],
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Word analysis error: {str(e)}")
-@router.get("/health")
-async def health_check():
-    """Simple health check endpoint"""
     return {
-        "status": "healthy",
-        "whisper_model": "tiny",
-        "cmu_dict_size": len(assessor.g2p.cmu_dict),
-        "vietnamese_optimized": True,
     }

+# ENHANCED PRONUNCIATION API - MULTI-WORD SUPPORT
+# Supports any English word using CMU Dict + phoneme libraries
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from typing import List, Dict, Optional, Tuple
 import tempfile
 import os
 import numpy as np
+import librosa
 import nltk
 import eng_to_ipa as ipa
+import pronouncing
+import requests
+import json
 import re
 from collections import defaultdict
 import warnings
 # Download required NLTK data
 try:
     nltk.download("cmudict", quiet=True)
+    nltk.download("punkt", quiet=True)
     from nltk.corpus import cmudict
 except:
     print("Warning: NLTK data not available")
 # =============================================================================
 # MODELS
 # =============================================================================
+router = APIRouter(prefix="/speaking", tags=["AI"])
+class PronunciationResult(BaseModel):
     overall_score: float
+    status: str
     feedback: List[str]
+    words: List[Dict]
+    phoneme_details: List[Dict]
+    audio_info: Dict
+    processing_time: float
+    difficulty_analysis: Dict
+class WordPhonemeInfo(BaseModel):
+    word: str
+    phonemes: List[str]
+    ipa_transcription: str
+    syllables: List[str]
+    stress_pattern: List[int]
 # =============================================================================
+# ENHANCED PHONEME PROCESSOR
 # =============================================================================
+class EnhancedPhonemeProcessor:
+    """Advanced phoneme processing with multiple dictionaries"""
     def __init__(self):
+        self.sample_rate = 16000
+        # Load CMU dictionary
         try:
             self.cmu_dict = cmudict.dict()
         except:
             self.cmu_dict = {}
             print("Warning: CMU dictionary not available")
+        # Load comprehensive phoneme acoustic models
+        self.phoneme_models = self._load_comprehensive_phoneme_models()
+        # Phoneme difficulty for Vietnamese speakers
+        self.difficulty_map = {
+            # Very difficult for Vietnamese
+            "TH": 0.9,  # think, that
+            "DH": 0.9,  # this, then
+            "V": 0.8,  # very, love
+            "Z": 0.8,  # zoo, rise
+            "ZH": 0.9,  # measure, vision
+            "R": 0.7,  # red, car
+            "L": 0.6,  # love, well
+            "W": 0.5,  # water, well
+            # Moderately difficult
+            "F": 0.4,  # fish, life
+            "S": 0.3,  # see, this
+            "SH": 0.5,  # shoe, fish
+            "CH": 0.4,  # chair, much
+            "JH": 0.5,  # job, bridge
+            # Vowels - challenging distinctions
+            "IY": 0.3,  # beat
+            "IH": 0.6,  # bit
+            "EY": 0.4,  # bait
+            "EH": 0.5,  # bet
+            "AE": 0.7,  # bat
+            "AH": 0.4,  # but
+            "AO": 0.6,  # bought
+            "OW": 0.4,  # boat
+            "UH": 0.6,  # book
+            "UW": 0.4,  # boot
+            # Easier sounds
+            "P": 0.2,
+            "B": 0.2,
+            "T": 0.2,
+            "D": 0.2,
+            "K": 0.2,
+            "G": 0.2,
+            "M": 0.2,
+            "N": 0.2,
+            "NG": 0.3,
+        }
+    def get_word_phonemes(self, word: str) -> WordPhonemeInfo:
+        """Get comprehensive phoneme info for any English word"""
+        word_lower = word.lower().strip()
+        # Method 1: CMU Dictionary (most reliable)
+        cmu_phonemes = []
         if word_lower in self.cmu_dict:
+            # Get first pronunciation variant
+            cmu_phonemes = self.cmu_dict[word_lower][0]
+            # Remove stress markers (0,1,2) from vowels
+            cmu_phonemes = [re.sub(r"[0-9]", "", p) for p in cmu_phonemes]
+        # Method 2: eng_to_ipa library
+        ipa_transcription = ""
+        try:
+            ipa_transcription = ipa.convert(word)
+        except:
+            ipa_transcription = f"/{word}/"
+        # Method 3: pronouncing library for syllables
+        syllables = []
         try:
+            syllable_count = pronouncing.syllable_count(word)
+            # Simple syllable division
+            if syllable_count and len(word) > syllable_count:
+                syllable_length = len(word) // syllable_count
+                syllables = [
+                    word[i : i + syllable_length]
+                    for i in range(0, len(word), syllable_length)
+                ]
+            else:
+                syllables = [word]
         except:
+            syllables = [word]
+        # Extract stress pattern from CMU
+        stress_pattern = []
+        if word_lower in self.cmu_dict:
+            for phoneme in self.cmu_dict[word_lower][0]:
+                stress = re.findall(r"[0-9]", phoneme)
+                if stress:
+                    stress_pattern.append(int(stress[0]))
+        # Fallback phonemes if CMU not available
+        if not cmu_phonemes:
+            cmu_phonemes = self._estimate_phonemes(word)
+        return WordPhonemeInfo(
+            word=word,
+            phonemes=cmu_phonemes,
+            ipa_transcription=ipa_transcription,
+            syllables=syllables,
+            stress_pattern=stress_pattern,
+        )
     def _estimate_phonemes(self, word: str) -> List[str]:
         """Estimate phonemes for unknown words"""
+        # Simple grapheme-to-phoneme mapping
         phoneme_map = {
             "ch": ["CH"],
             "sh": ["SH"],
         while i < len(word):
             # Check 2-letter combinations first
+            if i < len(word) - 1:
                 two_char = word[i : i + 2]
                 if two_char in phoneme_map:
                     phonemes.extend(phoneme_map[two_char])
         return phonemes
+    def _load_comprehensive_phoneme_models(self) -> Dict:
+        """Load comprehensive phoneme acoustic models"""
+        # Extended phoneme set with acoustic characteristics
+        models = {}
+        # VOWELS
+        vowel_models = {
+            "IY": {"f1": 270, "f2": 2300, "duration": 150, "type": "vowel"},  # beat
+            "IH": {"f1": 390, "f2": 1990, "duration": 120, "type": "vowel"},  # bit
+            "EY": {"f1": 400, "f2": 2100, "duration": 160, "type": "vowel"},  # bait
+            "EH": {"f1": 550, "f2": 1770, "duration": 130, "type": "vowel"},  # bet
+            "AE": {"f1": 690, "f2": 1660, "duration": 140, "type": "vowel"},  # bat
+            "AH": {"f1": 640, "f2": 1190, "duration": 110, "type": "vowel"},  # but
+            "AO": {"f1": 570, "f2": 840, "duration": 150, "type": "vowel"},  # bought
+            "OW": {"f1": 430, "f2": 1020, "duration": 160, "type": "vowel"},  # boat
+            "UH": {"f1": 450, "f2": 1030, "duration": 120, "type": "vowel"},  # book
+            "UW": {"f1": 310, "f2": 870, "duration": 150, "type": "vowel"},  # boot
+            "ER": {"f1": 490, "f2": 1350, "duration": 140, "type": "vowel"},  # bird
+            "AY": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"},  # bite
+            "AW": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"},  # bout
+            "OY": {"f1": 570, "f2": 840, "duration": 180, "type": "vowel"},  # boy
         }
+        # CONSONANTS
+        consonant_models = {
+            # Stops
+            "P": {
+                "burst_energy": 0.8,
+                "duration": 80,
+                "type": "stop",
+                "voicing": False,
+            },
+            "B": {"burst_energy": 0.7, "duration": 85, "type": "stop", "voicing": True},
+            "T": {
+                "burst_energy": 0.9,
+                "duration": 75,
+                "type": "stop",
+                "voicing": False,
+            },
+            "D": {
+                "burst_energy": 0.75,
+                "duration": 80,
+                "type": "stop",
+                "voicing": True,
+            },
+            "K": {
+                "burst_energy": 0.85,
+                "duration": 70,
+                "type": "stop",
+                "voicing": False,
+            },
+            "G": {"burst_energy": 0.7, "duration": 75, "type": "stop", "voicing": True},
+            # Fricatives (challenging for Vietnamese)
+            "F": {
+                "high_freq": True,
+                "duration": 120,
+                "type": "fricative",
+                "voicing": False,
+            },
+            "V": {
+                "high_freq": True,
+                "duration": 110,
+                "type": "fricative",
+                "voicing": True,
+            },
+            "TH": {
+                "high_freq": True,
+                "duration": 130,
+                "type": "fricative",
+                "voicing": False,
+            },  # think
+            "DH": {
+                "high_freq": True,
+                "duration": 120,
+                "type": "fricative",
+                "voicing": True,
+            },  # this
+            "S": {
+                "very_high_freq": True,
+                "duration": 140,
+                "type": "fricative",
+                "voicing": False,
+            },
+            "Z": {
+                "very_high_freq": True,
+                "duration": 130,
+                "type": "fricative",
+                "voicing": True,
+            },
+            "SH": {
+                "high_freq": True,
+                "duration": 150,
+                "type": "fricative",
+                "voicing": False,
+            },  # shoe
+            "ZH": {
+                "high_freq": True,
+                "duration": 140,
+                "type": "fricative",
+                "voicing": True,
+            },  # measure
+            "HH": {
+                "breathy": True,
+                "duration": 100,
+                "type": "fricative",
+                "voicing": False,
+            },  # hello
+            # Affricates
+            "CH": {
+                "burst_fricative": True,
+                "duration": 160,
+                "type": "affricate",
+                "voicing": False,
+            },  # chair
+            "JH": {
+                "burst_fricative": True,
+                "duration": 150,
+                "type": "affricate",
+                "voicing": True,
+            },  # job
+            # Nasals
+            "M": {"nasal": True, "duration": 100, "type": "nasal", "voicing": True},
+            "N": {"nasal": True, "duration": 95, "type": "nasal", "voicing": True},
+            "NG": {
+                "nasal": True,
+                "duration": 105,
+                "type": "nasal",
+                "voicing": True,
+            },  # ring
+            # Liquids (challenging L/R distinction)
+            "L": {"lateral": True, "duration": 90, "type": "liquid", "voicing": True},
+            "R": {"retroflex": True, "duration": 95, "type": "liquid", "voicing": True},
+            # Glides
+            "Y": {"glide": True, "duration": 70, "type": "glide", "voicing": True},
+            "W": {"glide": True, "duration": 75, "type": "glide", "voicing": True},
         }
+        # Combine models
+        models.update(vowel_models)
+        models.update(consonant_models)
+        return models
+    def get_difficulty_score(self, phonemes: List[str]) -> float:
+        """Calculate difficulty score for Vietnamese speakers"""
+        if not phonemes:
+            return 0.5
+        difficulties = []
+        for phoneme in phonemes:
+            # Remove stress markers
+            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
+            difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
+            difficulties.append(difficulty)
+        return np.mean(difficulties)
+    def score_phoneme_advanced(
+        self, phoneme: str, segment_features: Dict, context: Dict = None
+    ) -> float:
+        """Advanced phoneme scoring với context"""
+        clean_phoneme = re.sub(r"[0-9]", "", phoneme)
+        if clean_phoneme not in self.phoneme_models:
+            return 0.5  # Unknown phoneme
+        model = self.phoneme_models[clean_phoneme]
+        score = 0.0
+        # Type-specific scoring
+        if model["type"] == "vowel":
+            score = self._score_vowel(clean_phoneme, segment_features, model)
+        elif model["type"] == "fricative":
+            score = self._score_fricative(clean_phoneme, segment_features, model)
+        elif model["type"] == "stop":
+            score = self._score_stop(clean_phoneme, segment_features, model)
+        elif model["type"] in ["liquid", "nasal", "glide", "affricate"]:
+            score = self._score_other_consonant(clean_phoneme, segment_features, model)
+        # Context adjustments
+        if context:
+            score = self._apply_context_adjustments(score, clean_phoneme, context)
+        # Difficulty adjustment for Vietnamese speakers
+        difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
+        # Easier scoring for more difficult phonemes
+        adjusted_score = score + (difficulty * 0.1)
+        return np.clip(adjusted_score, 0, 1)
+    def _score_vowel(self, phoneme: str, features: Dict, model: Dict) -> float:
+        """Score vowel phoneme"""
+        score = 0.0
+        # Energy check (vowels should have good energy)
+        if features.get("rms_mean", 0) > 0.01:
+            score += 0.3
+        # Spectral characteristics
+        centroid = features.get("spectral_centroid_mean", 0)
+        target_f2 = model.get("f2", 1500)
+        # F2 approximation from spectral centroid
+        f2_error = abs(centroid - target_f2) / target_f2
+        f2_score = max(0, 1 - f2_error)
+        score += 0.4 * f2_score
+        # Stability (vowels should be stable)
+        zcr = features.get("zcr_mean", 0)
+        if zcr < 0.1:  # Low zero crossing for vowels
+            score += 0.3
+        return score
+    def _score_fricative(self, phoneme: str, features: Dict, model: Dict) -> float:
+        """Score fricative phoneme"""
+        score = 0.0
+        # High frequency content for fricatives
+        centroid = features.get("spectral_centroid_mean", 0)
+        zcr = features.get("zcr_mean", 0)
+        if model.get("very_high_freq"):  # S, Z sounds
+            if centroid > 3000:
+                score += 0.4
+            if zcr > 0.2:
+                score += 0.4
+        elif model.get("high_freq"):  # F, V, TH, DH, SH, ZH
+            if centroid > 1500:
+                score += 0.4
+            if zcr > 0.15:
+                score += 0.3
+        # Voicing check
+        energy = features.get("rms_mean", 0)
+        if model.get("voicing") and energy > 0.01:  # Voiced fricatives
+            score += 0.2
+        elif not model.get("voicing") and energy < 0.05:  # Voiceless fricatives
+            score += 0.2
+        return score
+    def _score_stop(self, phoneme: str, features: Dict, model: Dict) -> float:
+        """Score stop consonant"""
+        score = 0.0
+        # Burst energy
+        energy = features.get("rms_mean", 0)
+        burst_threshold = 0.02 if model.get("voicing") else 0.03
+        if energy > burst_threshold:
+            score += 0.6
+        # Duration check
+        # Stops should be relatively short
+        score += 0.4  # Base score for presence
+        return score
+    def _score_other_consonant(
+        self, phoneme: str, features: Dict, model: Dict
+    ) -> float:
+        """Score other consonant types"""
+        score = 0.0
+        energy = features.get("rms_mean", 0)
+        centroid = features.get("spectral_centroid_mean", 0)
+        zcr = features.get("zcr_mean", 0)
+        if model["type"] == "liquid":
+            # L/R sounds - moderate energy, specific spectral characteristics
+            if 0.01 <= energy <= 0.08:
+                score += 0.3
+            if phoneme == "R" and centroid < 1800:  # R lowers F3
+                score += 0.4
+            elif phoneme == "L" and 1200 <= centroid <= 2200:
+                score += 0.4
+            score += 0.3  # Base score
+        elif model["type"] == "nasal":
+            # Nasal sounds - good energy, specific spectral pattern
+            if energy > 0.005:
+                score += 0.4
+            if 800 <= centroid <= 2000:
+                score += 0.3
+            score += 0.3
+        elif model["type"] == "glide":
+            # W/Y sounds - transition characteristics
+            if energy > 0.005:
+                score += 0.5
+            score += 0.5
+        elif model["type"] == "affricate":
+            # CH/JH - combination of stop + fricative
+            if energy > 0.02:  # Burst component
+                score += 0.3
+            if zcr > 0.1:  # Fricative component
+                score += 0.4
+            score += 0.3
+        return score
+    def _apply_context_adjustments(
+        self, score: float, phoneme: str, context: Dict
+    ) -> float:
+        """Apply contextual adjustments"""
+        # Position in word adjustments
+        position = context.get("position", "middle")
+        if position == "initial" and phoneme in ["TH", "DH"]:
+            score *= 1.1  # Easier in initial position
+        elif position == "final" and phoneme in ["T", "D", "K", "G"]:
+            score *= 0.9  # Harder in final position (Vietnamese tendency to drop)
+        # Surrounding phonemes
+        prev_phoneme = context.get("prev_phoneme")
+        next_phoneme = context.get("next_phoneme")
+        # Consonant clusters (difficult for Vietnamese)
+        if (
+            prev_phoneme
+            and prev_phoneme in ["S", "T", "K"]
+            and phoneme in ["T", "K", "P"]
+        ):
+            score *= 0.8  # Consonant clusters are harder
+        return score
+# =============================================================================
+# ENHANCED PRONUNCIATION ASSESSOR
+# =============================================================================
+class EnhancedPronunciationAssessor:
+    """Enhanced assessor supporting any English word"""
     def __init__(self):
+        """Create the phoneme processor and set the analysis sample rate."""
+        # Provides word-to-phoneme lookup, the difficulty map and phoneme scoring.
+        self.phoneme_processor = EnhancedPhonemeProcessor()
+        # Sample rate passed to librosa when loading audio and extracting features.
         self.sample_rate = 16000
+    def process_audio_file(self, file_path: str, reference_text: str) -> Dict:
+        """Process audio file with enhanced phoneme analysis"""
+        # Load and validate audio
+        audio, sr = librosa.load(file_path, sr=self.sample_rate)
+        duration = len(audio) / sr
+        max_amplitude = np.max(np.abs(audio))
+        # Audio quality analysis
+        audio_info = self._analyze_audio_quality(audio, duration, max_amplitude)
+        # Extract comprehensive features
+        features = self._extract_comprehensive_features(audio)
+        # Text analysis
+        text_analysis = self._analyze_text(reference_text)
+        # Pronunciation assessment
+        pronunciation_analysis = self._assess_pronunciation(
+            audio, features, reference_text, text_analysis
         )
+        return {
+            "audio_info": audio_info,
+            "text_analysis": text_analysis,
+            "pronunciation_analysis": pronunciation_analysis,
+            "features": features,
+        }
+    def _analyze_audio_quality(
+        self, audio: np.ndarray, duration: float, max_amplitude: float
+    ) -> Dict:
+        """Comprehensive audio quality analysis"""
+        issues = []
+        quality_score = 1.0
+        # Duration checks
+        if duration < 0.5:
+            issues.append("too_short")
+            quality_score *= 0.5
+        elif duration > 30:
+            issues.append("too_long")
+            quality_score *= 0.8
+        # Amplitude checks
+        if max_amplitude < 0.005:
+            issues.append("too_quiet")
+            quality_score *= 0.6
+        elif max_amplitude > 0.98:
+            issues.append("clipped")
+            quality_score *= 0.7
+        # Noise analysis
+        noise_floor = np.mean(np.abs(audio[: int(0.1 * len(audio))]))  # First 100ms
+        if noise_floor > 0.02:
+            issues.append("noisy")
+            quality_score *= 0.8
+        # Signal-to-noise ratio
+        signal_power = np.mean(audio**2)
+        snr = 10 * np.log10(signal_power / (noise_floor**2 + 1e-10))
+        return {
+            "duration": duration,
+            "max_amplitude": max_amplitude,
+            "noise_floor": noise_floor,
+            "snr": snr,
+            "quality_score": quality_score,
+            "issues": issues,
+            "quality_status": "good" if not issues else ",".join(issues),
+        }
+    def _extract_comprehensive_features(self, audio: np.ndarray) -> Dict:
+        """Extract comprehensive acoustic features"""
+        features = {}
+        # Basic features
+        features["mfcc"] = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
+        features["mfcc_mean"] = np.mean(features["mfcc"], axis=1).tolist()
+        # Energy features
+        rms = librosa.feature.rms(y=audio, hop_length=512)[0]
+        features["rms"] = rms.tolist()
+        features["rms_mean"] = float(np.mean(rms))
+        features["rms_std"] = float(np.std(rms))
+        # Spectral features
+        spectral_centroid = librosa.feature.spectral_centroid(
+            y=audio, sr=self.sample_rate
+        )[0]
+        features["spectral_centroid"] = spectral_centroid.tolist()
+        features["spectral_centroid_mean"] = float(np.mean(spectral_centroid))
+        features["spectral_centroid_std"] = float(np.std(spectral_centroid))
+        # Additional spectral features
+        spectral_bandwidth = librosa.feature.spectral_bandwidth(
+            y=audio, sr=self.sample_rate
+        )[0]
+        features["spectral_bandwidth_mean"] = float(np.mean(spectral_bandwidth))
+        spectral_rolloff = librosa.feature.spectral_rolloff(
+            y=audio, sr=self.sample_rate
+        )[0]
+        features["spectral_rolloff_mean"] = float(np.mean(spectral_rolloff))
+        # Zero crossing rate
+        zcr = librosa.feature.zero_crossing_rate(audio, hop_length=512)[0]
+        features["zcr"] = zcr.tolist()
+        features["zcr_mean"] = float(np.mean(zcr))
+        features["zcr_std"] = float(np.std(zcr))
+        # Pitch analysis
+        pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate)
+        f0 = []
+        for t in range(pitches.shape[1]):
+            index = magnitudes[:, t].argmax()
+            pitch = pitches[index, t]
+            f0.append(
+                float(pitch) if pitch > 80 else 0.0
+            )  # Filter out very low frequencies
+        features["f0"] = f0
+        valid_f0 = [f for f in f0 if f > 0]
+        features["f0_mean"] = float(np.mean(valid_f0)) if valid_f0 else 0.0
+        features["f0_std"] = float(np.std(valid_f0)) if valid_f0 else 0.0
+        # Formant estimation (simplified)
+        features["formants"] = self._estimate_formants(audio)
+        return features
+    def _analyze_text(self, text: str) -> Dict:
+        """Analyze reference text for phonemes and difficulty.
+
+        The text is lower-cased and split on whitespace; each token is passed
+        to ``self.phoneme_processor.get_word_phonemes``.
+
+        Returns:
+            Dict with ``words`` (one entry per token holding its phonemes, IPA,
+            syllables, difficulty and challenging phonemes),
+            ``total_phonemes``, ``difficulty_score`` (difficulty over all
+            phonemes of the text) and ``challenging_sounds`` (de-duplicated
+            phonemes whose difficulty exceeds 0.6).
+        """
+        # NOTE(review): punctuation is not stripped here, so tokens such as
+        # "hello," reach get_word_phonemes unchanged - confirm it handles them.
+        words = text.lower().strip().split()
+        text_info = {
+            "words": [],
+            "total_phonemes": 0,
+            "difficulty_score": 0,
+            "challenging_sounds": [],
+        }
+        all_phonemes = []
+        for word in words:
+            word_info = self.phoneme_processor.get_word_phonemes(word)
+            # Calculate word difficulty
+            word_difficulty = self.phoneme_processor.get_difficulty_score(
+                word_info.phonemes
+            )
+            # Find challenging phonemes
+            challenging = []
+            for phoneme in word_info.phonemes:
+                # Strip stress digits before the difficulty lookup.
+                clean_phoneme = re.sub(r"[0-9]", "", phoneme)
+                # Unknown phonemes default to 0 here, so they are never flagged.
+                difficulty = self.phoneme_processor.difficulty_map.get(clean_phoneme, 0)
+                if difficulty > 0.6:
+                    challenging.append(clean_phoneme)
+            word_data = {
+                "word": word,
+                "phonemes": word_info.phonemes,
+                "ipa": word_info.ipa_transcription,
+                "syllables": word_info.syllables,
+                "difficulty": word_difficulty,
+                "challenging_phonemes": challenging,
+            }
+            text_info["words"].append(word_data)
+            all_phonemes.extend(word_info.phonemes)
+            text_info["challenging_sounds"].extend(challenging)
+        text_info["total_phonemes"] = len(all_phonemes)
+        text_info["difficulty_score"] = self.phoneme_processor.get_difficulty_score(
+            all_phonemes
         )
+        # Going through a set removes duplicates but leaves the order unspecified.
+        text_info["challenging_sounds"] = list(
+            set(text_info["challenging_sounds"])
+        )  # Remove duplicates
+        return text_info
+    def _assess_pronunciation(
+        self, audio: np.ndarray, features: Dict, text: str, text_analysis: Dict
+    ) -> Dict:
+        """Comprehensive pronunciation assessment.
+
+        Splits the audio into one segment per whitespace-separated word of
+        ``text``, scores each word, and aggregates the results.
+
+        Returns:
+            Dict with ``overall_score`` (mean word score, 0.0 when no word was
+            scored), ``words``, ``phoneme_details`` (flattened across words),
+            ``feedback``, ``status`` and ``difficulty_analysis``.
+        """
+        # Same tokenisation as _analyze_text, so index i addresses the same
+        # word in text_analysis["words"].
+        words = text.lower().strip().split()
+        word_segments = self._segment_words_advanced(audio, features, len(words))
+        word_results = []
+        phoneme_results = []
+        for i, word in enumerate(words):
+            # Words without a matching audio segment are skipped, not scored.
+            if i < len(word_segments):
+                word_audio = word_segments[i]
+                word_info = text_analysis["words"][i]
+                # Assess word
+                word_result = self._assess_word_comprehensive(
+                    word_audio, word_info, features, i, len(words)
+                )
+                word_results.append(word_result)
+                phoneme_results.extend(word_result["phoneme_details"])
+        # Calculate overall metrics
+        overall_score = (
+            np.mean([wr["score"] for wr in word_results]) if word_results else 0.0
+        )
+        # Generate comprehensive feedback
+        feedback = self._generate_comprehensive_feedback(
+            word_results, text_analysis, features, overall_score
+        )
+        # Difficulty analysis
+        difficulty_analysis = self._analyze_difficulty_performance(
+            word_results, text_analysis
         )
         return {
             "overall_score": overall_score,
+            "words": word_results,
+            "phoneme_details": phoneme_results,
             "feedback": feedback,
+            "status": self._get_status(overall_score),
+            "difficulty_analysis": difficulty_analysis,
         }
+    def _segment_words_advanced(
+        self, audio: np.ndarray, features: Dict, num_words: int
+    ) -> List[np.ndarray]:
+        """Advanced word segmentation using energy and spectral cues"""
+        if num_words == 1:
+            return [audio]
+        # Use RMS energy to find word boundaries
+        rms = features["rms"]
+        # Find energy peaks (potential word centers)
+        from scipy.signal import find_peaks
+        # Smooth RMS for better peak detection
+        window_size = min(5, len(rms) // 4)
+        if window_size > 0:
+            rms_smooth = np.convolve(
+                rms, np.ones(window_size) / window_size, mode="same"
+            )
+        else:
+            rms_smooth = rms
+        peaks, _ = find_peaks(
+            rms_smooth,
+            height=np.mean(rms_smooth) * 0.5,
+            distance=len(rms) // (num_words * 2),
+        )
+        # If we don't find enough peaks, fall back to equal division
+        if len(peaks) < num_words:
+            segment_length = len(audio) // num_words
+            segments = []
+            for i in range(num_words):
+                start = i * segment_length
+                end = start + segment_length if i < num_words - 1 else len(audio)
+                segments.append(audio[start:end])
+            return segments
+        # Use peaks to define word boundaries
+        hop_length = 512
+        peak_times = librosa.frames_to_samples(peaks, hop_length=hop_length)
+        segments = []
+        for i in range(num_words):
+            if i == 0:
+                start = 0
+                end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
+                    num_words * 4
+                )
+            elif i == num_words - 1:
+                start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
+                    num_words * 4
+                )
+                end = len(audio)
+            else:
+                start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
+                    num_words * 6
+                )
+                end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
+                    num_words * 6
+                )
+            start = max(0, start)
+            end = min(len(audio), end)
+            segments.append(audio[start:end])
+        return segments
+    def _assess_word_comprehensive(
         self,
+        word_audio: np.ndarray,
+        word_info: Dict,
+        global_features: Dict,
+        word_index: int,
+        total_words: int,
+    ) -> Dict:
+        """Comprehensive word assessment.
+
+        Splits the word's audio evenly across its phonemes, scores every
+        segment longer than 100 samples, and averages those scores.
+
+        Args:
+            word_audio: Audio samples attributed to this word.
+            word_info: Entry from the text analysis (reads ``word``,
+                ``phonemes``, ``ipa``, ``syllables`` and ``difficulty``).
+            global_features: Utterance-level features; not read in this method.
+            word_index: Zero-based position of the word in the text.
+            total_words: Number of words in the text.
+
+        Returns:
+            Dict describing the word: score, status, issues and per-phoneme
+            details.
+        """
+        # NOTE(review): this early return carries fewer keys than the normal
+        # result below (no "phonemes", "phoneme_scores", "ipa", "syllables",
+        # "difficulty") - confirm consumers tolerate the missing keys.
+        if len(word_audio) < 500:
+            return {
+                "word": word_info["word"],
+                "score": 0.2,
+                "status": "poor",
+                "issues": ["too_short"],
+                "phoneme_details": [],
             }
+        # Extract word-level features
+        word_features = self._extract_word_features(word_audio)
+        # Assess each phoneme
+        phonemes = word_info["phonemes"]
+        phoneme_segments = self._segment_phonemes(word_audio, len(phonemes))
+        phoneme_scores = []
+        phoneme_details = []
+        for i, (phoneme, segment) in enumerate(zip(phonemes, phoneme_segments)):
+            # Segments of 100 samples or fewer are skipped and contribute no score.
+            if len(segment) > 100:  # Minimum segment length
+                segment_features = self._extract_segment_features(segment)
+                # Context information
+                context = {
+                    "position": (
+                        "initial"
+                        if i == 0
+                        else "final" if i == len(phonemes) - 1 else "middle"
                     ),
+                    "prev_phoneme": phonemes[i - 1] if i > 0 else None,
+                    "next_phoneme": phonemes[i + 1] if i < len(phonemes) - 1 else None,
+                    # Fractional position of the word within the text.
+                    "word_position": word_index / total_words,
                 }
+                score = self.phoneme_processor.score_phoneme_advanced(
+                    phoneme, segment_features, context
+                )
+                phoneme_scores.append(score)
+                phoneme_details.append(
+                    {
+                        "phoneme": phoneme,
+                        "score": score,
+                        "position": context["position"],
+                        "difficulty": self.phoneme_processor.difficulty_map.get(
+                            re.sub(r"[0-9]", "", phoneme), 0.3
+                        ),
+                        "word": word_info["word"],
+                    }
+                )
+        # Word-level score: mean of the scored phonemes, 0.0 when none were scored.
+        word_score = np.mean(phoneme_scores) if phoneme_scores else 0.0
+        # Detect issues
+        issues = []
+        if word_score < 0.3:
+            issues.append("very_poor_clarity")
+        if word_features.get("rms_mean", 0) < 0.005:
+            issues.append("too_quiet")
+        if word_features.get("zcr_mean", 0) > 0.3:
+            issues.append("too_noisy")
+        return {
+            "word": word_info["word"],
+            "score": word_score,
+            "status": self._get_word_status(word_score),
+            "phonemes": phonemes,
+            "phoneme_scores": phoneme_scores,
+            "phoneme_details": phoneme_details,
+            "ipa": word_info["ipa"],
+            "syllables": word_info["syllables"],
+            "difficulty": word_info["difficulty"],
+            "issues": issues,
+        }
+    def _extract_word_features(self, word_audio: np.ndarray) -> Dict:
+        """Extract features for word segment"""
+        if len(word_audio) < 100:
+            return {}
+        mfcc = librosa.feature.mfcc(y=word_audio, sr=self.sample_rate, n_mfcc=13)
+        rms = librosa.feature.rms(y=word_audio)[0]
+        centroid = librosa.feature.spectral_centroid(y=word_audio, sr=self.sample_rate)[
+            0
+        ]
+        zcr = librosa.feature.zero_crossing_rate(word_audio)[0]
+        return {
+            "mfcc_mean": np.mean(mfcc, axis=1).tolist(),
+            "rms_mean": float(np.mean(rms)),
+            "spectral_centroid_mean": float(np.mean(centroid)),
+            "zcr_mean": float(np.mean(zcr)),
+        }
+    def _segment_phonemes(
+        self, word_audio: np.ndarray, num_phonemes: int
+    ) -> List[np.ndarray]:
+        """Segment word audio into phonemes"""
+        if num_phonemes <= 1:
+            return [word_audio]
+        segment_length = len(word_audio) // num_phonemes
+        segments = []
+        for i in range(num_phonemes):
+            start = i * segment_length
+            end = start + segment_length if i < num_phonemes - 1 else len(word_audio)
+            segments.append(word_audio[start:end])
+        return segments
+    def _extract_segment_features(self, segment: np.ndarray) -> Dict:
+        """Extract features for phoneme segment"""
+        if len(segment) < 50:
+            return {}
+        # Basic features for short segments
+        rms_mean = float(np.mean(librosa.feature.rms(y=segment)[0]))
+        zcr_mean = float(np.mean(librosa.feature.zero_crossing_rate(segment)[0]))
+        # Spectral centroid
+        centroid = librosa.feature.spectral_centroid(y=segment, sr=self.sample_rate)[0]
+        centroid_mean = float(np.mean(centroid))
+        # MFCC for short segment
+        if len(segment) > 512:
+            mfcc = librosa.feature.mfcc(y=segment, sr=self.sample_rate, n_mfcc=5)
+            mfcc_mean = np.mean(mfcc, axis=1).tolist()
+        else:
+            mfcc_mean = [0] * 5
+        return {
+            "rms_mean": rms_mean,
+            "zcr_mean": zcr_mean,
+            "spectral_centroid_mean": centroid_mean,
+            "mfcc_mean": mfcc_mean,
+        }
+    def _generate_comprehensive_feedback(
         self,
+        word_results: List[Dict],
+        text_analysis: Dict,
+        features: Dict,
         overall_score: float,
     ) -> List[str]:
+        """Generate comprehensive feedback.
+
+        Builds a list of messages covering, in order: the overall score band,
+        recording volume, pitch variation, words scoring below 0.5,
+        articulation tips for difficult phonemes that scored poorly, and a
+        closing remark based on text difficulty.
+
+        Args:
+            word_results: Per-word results (reads ``score``, ``word`` and
+                ``phoneme_details``).
+            text_analysis: Text analysis dict (reads ``difficulty_score``).
+            features: Utterance-level features (reads ``rms_mean``, ``f0_std``).
+            overall_score: Overall pronunciation score.
+
+        Returns:
+            The feedback messages.
+        """
         feedback = []
+        # Overall performance feedback
+        if overall_score >= 0.85:
+            feedback.append(
+                "🎉 Outstanding pronunciation! You sound very natural and clear."
+            )
+        elif overall_score >= 0.7:
+            feedback.append(
+                "👍 Great job! Your pronunciation is quite good with room for minor improvements."
+            )
+        elif overall_score >= 0.5:
+            feedback.append(
+                "📚 Good progress! Keep practicing the areas highlighted below."
+            )
+        elif overall_score >= 0.3:
             feedback.append(
+                "🔄 Keep working on it! Focus on clarity and the specific sounds mentioned."
             )
         else:
+            feedback.append(
+                "💪 Don't give up! Start with slower, clearer pronunciation."
+            )
+        # Audio quality feedback (mean RMS energy is used as the loudness measure)
+        audio_quality = features.get("rms_mean", 0)
+        if audio_quality < 0.01:
+            feedback.append(
+                "🔊 Try speaking louder and more clearly - your recording was quite quiet."
+            )
+        elif audio_quality > 0.15:
+            feedback.append("🔉 Good volume level! Your voice comes through clearly.")
+        # Pitch variation feedback (standard deviation of the non-zero f0 values)
+        pitch_std = features.get("f0_std", 0)
+        if pitch_std < 20:
+            feedback.append(
+                "🎵 Try adding more natural pitch variation to sound more engaging."
+            )
+        elif pitch_std > 80:
+            feedback.append(
+                "🎵 Good pitch variation! Your speech sounds natural and expressive."
+            )
+        # Word-specific feedback
+        poor_words = [wr for wr in word_results if wr["score"] < 0.5]
+        if poor_words:
+            word_names = [w["word"] for w in poor_words]
+            feedback.append(f"🎯 Focus extra practice on: {', '.join(word_names)}")
         # Phoneme-specific feedback for Vietnamese speakers
+        # Collect phonemes that are both hard (difficulty > 0.6) and poorly scored.
+        all_challenging = []
+        for word_result in word_results:
+            for phoneme_detail in word_result.get("phoneme_details", []):
+                if phoneme_detail["score"] < 0.5 and phoneme_detail["difficulty"] > 0.6:
+                    all_challenging.append(phoneme_detail["phoneme"])
+        if all_challenging:
+            # NOTE(review): set() leaves the order unspecified, so the slice
+            # below picks three arbitrary phonemes, not the three worst ones.
+            # De-duplication also happens before stress digits are stripped, so
+            # variants differing only by digit may each take a slot.
+            unique_challenging = list(set(all_challenging))
+            vietnamese_tips = {
+                "TH": "Put your tongue between your teeth and blow air gently",
+                "DH": "Same tongue position as TH, but vibrate your vocal cords",
+                "V": "Touch your bottom lip to your top teeth, then voice",
+                "R": "Curl your tongue without touching the roof of your mouth",
+                "L": "Touch your tongue tip to the roof of your mouth",
+                "Z": "Like 'S' but with vocal cord vibration",
+            }
+            for phoneme in unique_challenging[:3]:  # Top 3 challenging
+                clean_phoneme = re.sub(r"[0-9]", "", phoneme)
+                # Phonemes without a prepared tip produce no message.
+                if clean_phoneme in vietnamese_tips:
+                    feedback.append(
+                        f"🔤 {clean_phoneme} sound: {vietnamese_tips[clean_phoneme]}"
+                    )
+        # Difficulty-based encouragement
+        text_difficulty = text_analysis["difficulty_score"]
+        if text_difficulty > 0.7 and overall_score > 0.6:
+            feedback.append(
+                "💪 Impressive! You tackled some very challenging sounds for Vietnamese speakers."
             )
+        elif text_difficulty < 0.3 and overall_score < 0.7:
+            feedback.append("📈 Try some more challenging words as you improve!")
         return feedback
+    def _analyze_difficulty_performance(
+        self, word_results: List[Dict], text_analysis: Dict
+    ) -> Dict:
+        """Analyze performance vs difficulty"""
+        easy_phonemes = []  # difficulty < 0.4
+        medium_phonemes = []  # 0.4 <= difficulty < 0.7
+        hard_phonemes = []  # difficulty >= 0.7
+        for word_result in word_results:
+            for phoneme_detail in word_result.get("phoneme_details", []):
+                difficulty = phoneme_detail["difficulty"]
+                score = phoneme_detail["score"]
+                if difficulty < 0.4:
+                    easy_phonemes.append(score)
+                elif difficulty < 0.7:
+                    medium_phonemes.append(score)
+                else:
+                    hard_phonemes.append(score)
+        return {
+            "easy_sounds_avg": float(np.mean(easy_phonemes)) if easy_phonemes else 0.0,
+            "medium_sounds_avg": (
+                float(np.mean(medium_phonemes)) if medium_phonemes else 0.0
+            ),
+            "hard_sounds_avg": float(np.mean(hard_phonemes)) if hard_phonemes else 0.0,
+            "total_challenging_sounds": len(hard_phonemes),
+            "mastered_difficult_sounds": len([s for s in hard_phonemes if s > 0.7]),
+            "text_difficulty": text_analysis["difficulty_score"],
+        }
     def _get_word_status(self, score: float) -> str:
         """Get word status from score"""
         if score >= 0.8:
         else:
             return "poor"
+    def _get_status(self, score: float) -> str:
+        """Get overall status"""
+        return self._get_word_status(score)
 # =============================================================================
+# ENHANCED FASTAPI APP
 # =============================================================================
+# Initialize enhanced processor
+# Module-level EnhancedPronunciationAssessor instance, built once when this
+# module is imported.
+assessor = EnhancedPronunciationAssessor()
+# =============================================================================
+# ENHANCED ENDPOINTS
+# =============================================================================
def _safe_audio_suffix(filename) -> str:
    """Derive a temp-file suffix from a client-supplied filename.

    The text after the last dot is used only when it is a short alphanumeric
    token. Anything else (missing or empty name, no dot, path separators or
    other unexpected characters) falls back to ``".wav"``.
    """
    if filename and "." in filename:
        extension = filename.rsplit(".", 1)[-1]
        if re.fullmatch(r"[A-Za-z0-9]{1,10}", extension):
            return f".{extension}"
    return ".wav"


@router.post("/assess", response_model=PronunciationResult)
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file"),
    reference_text: str = Form(..., description="Any English text"),
    difficulty_level: str = Form("medium", description="easy, medium, hard"),
):
    """
    Assess pronunciation for ANY English text
    Supports 60,000+ words from CMU Pronouncing Dictionary

    The upload is written to a temporary file, scored against
    ``reference_text`` by the shared assessor, and the scores are then scaled
    by ``difficulty_level``: "easy" multiplies by 1.2 (capped at 1.0), "hard"
    multiplies by 0.8, and any other value leaves the scores unchanged.

    Raises:
        HTTPException: 400 when the reference text is empty, longer than 1000
            characters, or contains characters outside the allowed set; 500
            when saving or processing the audio fails.
    """
    import time

    start_time = time.time()
    print("Starting pronunciation assessment...")
    print("Reference text:", reference_text)
    print("Difficulty level:", difficulty_level)
    print("Audio filename:", audio.filename if audio else "None")

    # Validate inputs
    if not reference_text.strip():
        print("Validation failed: Reference text is empty")
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")
    if len(reference_text) > 1000:
        print("Validation failed: Reference text too long")
        raise HTTPException(
            status_code=400, detail="Reference text too long (max 1000 characters)"
        )
    # Only English letters, whitespace and basic punctuation are accepted.
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        print("Validation failed: Invalid characters in text")
        print("Text that failed validation:", repr(reference_text))
        raise HTTPException(
            status_code=400,
            detail="Text contains invalid characters. Only English letters, spaces, and basic punctuation (,.'-!?;:) allowed.",
        )

    tmp_path = None
    try:
        print("Saving uploaded file...")
        # The filename is client-controlled, so only a sanitised extension is
        # allowed to reach the temp-file name.
        file_extension = _safe_audio_suffix(audio.filename)
        content = await audio.read()
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(content)
        # The file is closed (and therefore flushed) before anything else opens
        # it; deleting or re-opening a still-open file fails on some platforms.
        print("File saved to:", tmp_path)
        print("File size:", len(content), "bytes")

        print("Processing audio file...")
        result = assessor.process_audio_file(tmp_path, reference_text)
        print("Audio processing completed")

        # Apply difficulty adjustments
        # NOTE(review): only the numeric scores are scaled; the "status" values
        # already present in the analysis are not recomputed - confirm that
        # this is intended.
        analysis = result["pronunciation_analysis"]
        if difficulty_level == "easy":
            analysis["overall_score"] = min(1.0, analysis["overall_score"] * 1.2)
            for word in analysis["words"]:
                word["score"] = min(1.0, word["score"] * 1.2)
        elif difficulty_level == "hard":
            analysis["overall_score"] = analysis["overall_score"] * 0.8
            for word in analysis["words"]:
                word["score"] = word["score"] * 0.8

        processing_time = time.time() - start_time
        print("Processing completed successfully in", processing_time, "seconds")
        return PronunciationResult(
            overall_score=analysis["overall_score"],
            status=analysis["status"],
            feedback=analysis["feedback"],
            words=analysis["words"],
            phoneme_details=analysis["phoneme_details"],
            audio_info=result["audio_info"],
            processing_time=processing_time,
            difficulty_analysis=analysis["difficulty_analysis"],
        )
    except Exception as e:
        print("Exception occurred during processing:", str(e))
        import traceback

        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")
    finally:
        # Remove the temp file on success and on failure alike; cleanup is
        # best-effort and must not mask the real outcome of the request.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                print("Warning: could not remove temporary file:", tmp_path)
 @router.get("/phonemes/{word}")
 async def get_word_phonemes(word: str):
+    """Get comprehensive phoneme information for ANY English word"""
     try:
+        word_info = assessor.phoneme_processor.get_word_phonemes(word)
+        # Calculate difficulty for Vietnamese speakers
+        difficulty = assessor.phoneme_processor.get_difficulty_score(word_info.phonemes)
+        # Get challenging phonemes
+        challenging_phonemes = []
+        for phoneme in word_info.phonemes:
+            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
+            phoneme_difficulty = assessor.phoneme_processor.difficulty_map.get(
+                clean_phoneme, 0
+            )
+            if phoneme_difficulty > 0.6:
+                challenging_phonemes.append(
+                    {
+                        "phoneme": clean_phoneme,
+                        "difficulty": phoneme_difficulty,
+                        "tips": get_phoneme_tips(clean_phoneme),
+                    }
+                )
         return {
             "word": word,
+            "phonemes": word_info.phonemes,
+            "ipa_transcription": word_info.ipa_transcription,
+            "syllables": word_info.syllables,
+            "stress_pattern": word_info.stress_pattern,
+            "difficulty_score": difficulty,
+            "difficulty_level": (
+                "hard" if difficulty > 0.7 else "medium" if difficulty > 0.4 else "easy"
+            ),
+            "challenging_phonemes": challenging_phonemes,
+            "pronunciation_tips": get_word_pronunciation_tips(word, word_info.phonemes),
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing word: {str(e)}")
@router.post("/analyze/text")
async def analyze_text_difficulty(text: str = Form(...)):
    """Analyze pronunciation difficulty of any English text

    Runs the shared assessor's text analysis and returns the word count,
    phoneme count, difficulty score with its easy/medium/hard label, the
    challenging sounds, the per-word breakdown and practice recommendations.

    Raises:
        HTTPException: 500 when the analysis fails.
    """
    try:
        analysis = assessor._analyze_text(text)

        word_count = len(analysis["words"])
        phoneme_total = analysis["total_phonemes"]
        score = analysis["difficulty_score"]

        # Same thresholds as the word-level endpoints: > 0.7 hard, > 0.4 medium.
        if score > 0.7:
            level = "hard"
        elif score > 0.4:
            level = "medium"
        else:
            level = "easy"

        return {
            "text": text,
            "word_count": word_count,
            "total_phonemes": phoneme_total,
            "overall_difficulty": score,
            "difficulty_level": level,
            "challenging_sounds": analysis["challenging_sounds"],
            "word_breakdown": analysis["words"],
            "recommendations": get_text_recommendations(analysis),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Text analysis error: {str(e)}")
@router.get("/dictionary/search")
async def search_dictionary(query: str, limit: int = 20):
    """Search CMU dictionary for words containing query

    The first ``limit`` dictionary entries whose spelling contains ``query``
    (compared in lower case) are collected in dictionary order and then sorted
    by difficulty, easiest first. The result is therefore "the first matches,
    sorted", not "the easiest matches overall".

    Raises:
        HTTPException: 500 when the lookup fails.
    """
    try:
        processor = assessor.phoneme_processor
        cmu_dict = processor.cmu_dict
        query_lower = query.lower()

        matching_words = []
        for word in cmu_dict:
            # Stop scanning once enough matches are collected; the remaining
            # entries could never be added to the result.
            if len(matching_words) >= limit:
                break
            if query_lower not in word:
                continue

            word_info = processor.get_word_phonemes(word)
            difficulty = processor.get_difficulty_score(word_info.phonemes)
            if difficulty > 0.7:
                difficulty_level = "hard"
            elif difficulty > 0.4:
                difficulty_level = "medium"
            else:
                difficulty_level = "easy"

            matching_words.append(
                {
                    "word": word,
                    "phonemes": word_info.phonemes,
                    "ipa": word_info.ipa_transcription,
                    "difficulty": difficulty,
                    "difficulty_level": difficulty_level,
                }
            )

        # Sort by difficulty (easiest first)
        matching_words.sort(key=lambda x: x["difficulty"])
        return {"query": query, "found": len(matching_words), "words": matching_words}
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Dictionary search error: {str(e)}"
        )
@router.get("/practice/level/{level}")
async def get_practice_words(level: str, count: int = 10):
    """Get practice words by difficulty level

    Dictionary words are visited in random order. Words shorter than 3 or
    longer than 12 characters, or containing non-alphabetic characters, are
    skipped. Up to ``count`` words whose difficulty falls in the requested
    band are returned with their phonemes, IPA and tips.

    The bands follow the labelling used by the other endpoints in this module
    (``> 0.7`` hard, ``> 0.4`` medium, otherwise easy), so a score exactly on a
    boundary belongs to one band only.

    Raises:
        HTTPException: 400 for an unknown level; 500 when sampling fails.
    """
    if level not in ["easy", "medium", "hard"]:
        raise HTTPException(
            status_code=400, detail="Level must be easy, medium, or hard"
        )
    try:
        processor = assessor.phoneme_processor
        cmu_dict = processor.cmu_dict
        practice_words = []

        # Define difficulty ranges (reported back to the client unchanged)
        if level == "easy":
            difficulty_range = (0, 0.4)
        elif level == "medium":
            difficulty_range = (0.4, 0.7)
        else:  # hard
            difficulty_range = (0.7, 1.0)
        lower, upper = difficulty_range

        # Sample words from dictionary
        word_list = list(cmu_dict)
        np.random.shuffle(word_list)

        for word in word_list:
            if len(practice_words) >= count:
                break
            # Skip very short or very long words
            if len(word) < 3 or len(word) > 12:
                continue
            # Skip words with special characters
            if not word.isalpha():
                continue

            word_info = processor.get_word_phonemes(word)
            difficulty = processor.get_difficulty_score(word_info.phonemes)

            # The lower bound is exclusive for medium/hard so that 0.4 and 0.7
            # are not claimed by two bands at once; easy keeps its lower bound
            # inclusive so a difficulty of 0 still qualifies.
            if level == "easy":
                in_band = lower <= difficulty <= upper
            else:
                in_band = lower < difficulty <= upper
            if not in_band:
                continue

            practice_words.append(
                {
                    "word": word,
                    "phonemes": word_info.phonemes,
                    "ipa": word_info.ipa_transcription,
                    "difficulty": difficulty,
                    "tips": get_word_pronunciation_tips(word, word_info.phonemes),
                }
            )

        return {
            "level": level,
            "difficulty_range": difficulty_range,
            "count": len(practice_words),
            "words": practice_words,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Practice words error: {str(e)}")
+# =============================================================================
+# HELPER FUNCTIONS
+# =============================================================================
def get_phoneme_tips(phoneme: str) -> List[str]:
    """Return articulation tips for ``phoneme``.

    Tips exist for TH, DH, V, R, L and Z; any other symbol gets a single
    generic practice tip. The lookup is an exact, case-sensitive match.
    """
    tips_by_phoneme = {
        "TH": [
            "Place tongue tip between upper and lower teeth",
            "Blow air gently while keeping tongue in position",
            "Should feel air flowing over tongue",
        ],
        "DH": [
            "Same tongue position as TH",
            "Add vocal cord vibration",
            "Should feel buzzing in throat",
        ],
        "V": [
            "Touch bottom lip to upper teeth",
            "Voice while air flows through the gap",
            "Don't use both lips like Vietnamese 'V'",
        ],
        "R": [
            "Curl tongue without touching roof of mouth",
            "Don't roll the R like in Vietnamese",
            "Tongue should float freely",
        ],
        "L": [
            "Touch tongue tip to roof of mouth behind teeth",
            "Let air flow around sides of tongue",
            "Make sure tongue actually touches",
        ],
        "Z": [
            "Same tongue position as 'S'",
            "Add vocal cord vibration",
            "Should buzz like a bee",
        ],
    }
    if phoneme in tips_by_phoneme:
        return tips_by_phoneme[phoneme]
    # Unknown symbol: fall back to the generic advice.
    return ["Practice this sound slowly and clearly"]
def get_word_pronunciation_tips(word: str, phonemes: List[str]) -> List[str]:
    """Get word-specific pronunciation tips

    Args:
        word: The word being practised (currently not used by the rules).
        phonemes: The word's phoneme symbols in order; vowels may carry
            trailing stress digits.

    Returns:
        Tips in a fixed order: S+T/K/P cluster, TH, R-vs-L, final stop
        consonant, long vowel. A single generic tip is returned when no rule
        applies.
    """
    tips = []

    # Consonant clusters: "S" immediately followed by T, K or P. Adjacent
    # symbols are compared directly; a substring test on the joined string
    # would also fire for "S" followed by "TH".
    cluster_followers = ("T", "K", "P")
    has_s_cluster = any(
        first == "S" and second in cluster_followers
        for first, second in zip(phonemes, phonemes[1:])
    )
    if has_s_cluster:
        tips.append("Practice the consonant cluster slowly, then speed up")

    # TH sounds
    if "TH" in phonemes:
        tips.append("Remember: tongue between teeth for TH sounds")

    # R and L distinction
    if "R" in phonemes and "L" in phonemes:
        tips.append("Focus on R (no touching) vs L (tongue touches roof)")

    # Final consonants (Vietnamese tendency to drop)
    final_phoneme = phonemes[-1] if phonemes else ""
    if final_phoneme in ("T", "D", "K", "G", "P", "B"):
        tips.append("Don't forget the final consonant sound")

    # Vowel length: stress digits are stripped before comparing.
    long_vowels = ("IY", "UW", "AO")
    if any(re.sub(r"[0-9]", "", p) in long_vowels for p in phonemes):
        tips.append("Make sure long vowels are actually longer")

    if not tips:
        tips.append("Break the word into syllables and practice each part")
    return tips
def get_text_recommendations(text_analysis: Dict) -> List[str]:
    """Build practice recommendations from a text analysis.

    Reads ``difficulty_score``, ``challenging_sounds`` and ``words`` (each word
    entry must carry a ``phonemes`` sequence). The result may be empty when no
    rule applies.
    """
    advice = []

    score = text_analysis["difficulty_score"]
    if score < 0.3:
        advice.append(
            "This text is good for beginners. Try adding more challenging words gradually."
        )
    elif score > 0.8:
        advice.append(
            "This is very challenging text. Consider starting with easier words first."
        )

    if len(text_analysis["challenging_sounds"]) > 5:
        advice.append(
            "This text has many challenging sounds. Practice individual words first."
        )

    # Count every word with more than 8 phonemes (all entries are inspected).
    long_word_count = sum(
        1 for entry in text_analysis["words"] if len(entry["phonemes"]) > 8
    )
    if long_word_count:
        advice.append("Break down longer words into syllables for easier practice.")

    return advice
+# =============================================================================
+# ADDITIONAL ENDPOINTS
+# =============================================================================
@router.get("/stats")
async def get_system_stats():
    """Get system statistics

    Reports the size of the loaded dictionary and phoneme model table together
    with a fixed description of the service's capabilities.
    """
    processor = assessor.phoneme_processor
    word_total = len(processor.cmu_dict)
    phoneme_total = len(processor.phoneme_models)

    feature_list = [
        "CMU Pronouncing Dictionary integration",
        "IPA transcription",
        "Syllable analysis",
        "Contextual phoneme scoring",
        "Vietnamese learner optimization",
    ]

    stats = {
        "total_words_supported": word_total,
        "phonemes_supported": phoneme_total,
        "difficulty_levels": ["easy", "medium", "hard"],
        "audio_formats_supported": ["wav", "mp3", "m4a", "flac"],
        "max_audio_duration": "30 seconds",
        "vietnamese_specific_features": True,
        "features": feature_list,
    }
    return stats
# NOTE(review): this static path is declared after the parameterised
# "/phonemes/{word}" route; FastAPI matches in declaration order - confirm this
# endpoint is reachable.
@router.get("/phonemes/difficult")
async def get_difficult_phonemes_for_vietnamese():
    """Get phonemes that are most difficult for Vietnamese speakers

    Every phoneme whose difficulty in the processor's difficulty map exceeds
    0.6 is listed with tips and example words, hardest first.
    """
    difficulty_map = assessor.phoneme_processor.difficulty_map

    # Only include challenging ones
    candidates = [
        {
            "phoneme": symbol,
            "difficulty": level,
            "tips": get_phoneme_tips(symbol),
            "example_words": get_example_words(symbol),
        }
        for symbol, level in difficulty_map.items()
        if level > 0.6
    ]

    # Hardest first; ties keep their original map order.
    ranked = sorted(candidates, key=lambda entry: entry["difficulty"], reverse=True)

    return {
        "difficult_phonemes": ranked,
        "total_count": len(ranked),
        "recommendation": "Focus on the top 5 most difficult sounds first",
    }
def get_example_words(phoneme: str) -> List[str]:
    """Return example words for ``phoneme``.

    Known symbols map to a fixed word list. Any other symbol yields a single
    placeholder of the form ``word_with_<phoneme in lower case>``.
    """
    samples = {
        "TH": ["think", "three", "math", "path"],
        "DH": ["this", "that", "mother", "weather"],
        "V": ["very", "love", "give", "have"],
        "Z": ["zoo", "zero", "buzz", "rise"],
        "R": ["red", "car", "very", "right"],
        "L": ["love", "hello", "well", "people"],
        "W": ["water", "well", "what", "sweet"],
        "ZH": ["measure", "vision", "treasure"],
        "CH": ["chair", "much", "teach"],
        "JH": ["job", "bridge", "age"],
        "SH": ["shoe", "fish", "nation"],
        "NG": ["ring", "thing", "young"],
    }
    # The placeholder is built up front, exactly as the lookup default was.
    placeholder = f"word_with_{phoneme.lower()}"
    if phoneme in samples:
        return samples[phoneme]
    return [placeholder]