# Hugging Face Space: ABAO77/Run_code_api (status: Sleeping) - file size: 21,406 bytes

from typing import List, Dict
import numpy as np
import nltk
import eng_to_ipa as ipa
import re
from collections import defaultdict


# Best-effort setup of the CMU Pronouncing Dictionary corpus.
# `nltk.download(..., quiet=True)` fetches the corpus data if needed and the
# import exposes the corpus reader as `cmudict`.
try:
    nltk.download("cmudict", quiet=True)
    from nltk.corpus import cmudict
except Exception:
    # Catch Exception rather than using a bare `except:` so Ctrl-C and
    # SystemExit still propagate. Keep `cmudict` bound so later references
    # fail with an ordinary, catchable error instead of an unbound name.
    cmudict = None
    print("Warning: NLTK data not available")


class SimpleG2P:
    """Simple grapheme-to-phoneme converter for reference text.

    Words present in the loaded CMU dictionary are converted from their CMU
    symbols to IPA-style symbols; every other word is approximated from its
    spelling with a small letter/digraph table.
    """

    # CMU symbol (stress digit already removed) -> IPA-style symbol used as
    # the comparison alphabet (referred to in this file as the
    # Wav2Vec2/eSpeak format). Defined once here instead of per call.
    _CMU_TO_ESPEAK = {
        "AA": "ɑ",
        "AE": "æ",
        "AH": "ʌ",
        "AO": "ɔ",
        "AW": "aʊ",
        "AY": "aɪ",
        "EH": "ɛ",
        "ER": "ɝ",
        "EY": "eɪ",
        "IH": "ɪ",
        "IY": "i",
        "OW": "oʊ",
        "OY": "ɔɪ",
        "UH": "ʊ",
        "UW": "u",
        "B": "b",
        "CH": "tʃ",
        "D": "d",
        "DH": "ð",
        "F": "f",
        "G": "ɡ",
        "HH": "h",
        "JH": "dʒ",
        "K": "k",
        "L": "l",
        "M": "m",
        "N": "n",
        "NG": "ŋ",
        "P": "p",
        "R": "r",
        "S": "s",
        "SH": "ʃ",
        "T": "t",
        "TH": "θ",
        "V": "v",
        "W": "w",
        "Y": "j",
        "Z": "z",
        "ZH": "ʒ",
    }

    # Spelling fallback for words missing from the dictionary. Two-letter
    # keys are tried before single letters; characters without an entry
    # (digits, apostrophes, underscores, ...) contribute no phoneme.
    _LETTER_TO_PHONEMES = {
        "ch": ("tʃ",),
        "sh": ("ʃ",),
        "th": ("θ",),
        "ph": ("f",),
        "ck": ("k",),
        "ng": ("ŋ",),
        "qu": ("k", "w"),
        "a": ("æ",),
        "e": ("ɛ",),
        "i": ("ɪ",),
        "o": ("ʌ",),
        "u": ("ʌ",),
        "b": ("b",),
        "c": ("k",),
        "d": ("d",),
        "f": ("f",),
        "g": ("ɡ",),
        "h": ("h",),
        "j": ("dʒ",),
        "k": ("k",),
        "l": ("l",),
        "m": ("m",),
        "n": ("n",),
        "p": ("p",),
        "r": ("r",),
        "s": ("s",),
        "t": ("t",),
        "v": ("v",),
        "w": ("w",),
        "x": ("k", "s"),
        "y": ("j",),
        "z": ("z",),
    }

    def __init__(self):
        """Load the CMU dictionary, falling back to an empty one on failure."""
        try:
            self.cmu_dict = cmudict.dict()
        except Exception:
            # Covers a missing/unloaded corpus as well as `cmudict` being
            # unavailable; Ctrl-C and SystemExit are no longer swallowed.
            self.cmu_dict = {}
            print("Warning: CMU dictionary not available")

    def text_to_phonemes(self, text: str) -> List[Dict]:
        """Convert text to a per-word phoneme description.

        Returns one dict per cleaned word with the keys ``word``,
        ``phonemes`` (list of symbols), ``ipa`` and ``phoneme_string``
        (the phonemes joined by single spaces).
        """
        phoneme_sequence = []
        for word in self._clean_text(text).split():
            word_phonemes = self._get_word_phonemes(word)
            phoneme_sequence.append(
                {
                    "word": word,
                    "phonemes": word_phonemes,
                    "ipa": self._get_ipa(word),
                    "phoneme_string": " ".join(word_phonemes),
                }
            )
        return phoneme_sequence

    def get_reference_phoneme_string(self, text: str) -> str:
        """Return all phonemes of ``text`` as one space-separated string."""
        return " ".join(
            phoneme
            for word_data in self.text_to_phonemes(text)
            for phoneme in word_data["phonemes"]
        )

    def _clean_text(self, text: str) -> str:
        """Lower-case ``text`` and normalize punctuation and whitespace.

        Everything except word characters, whitespace and apostrophes is
        replaced by a space, then whitespace runs are collapsed.
        """
        text = re.sub(r"[^\w\s\']", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.lower().strip()

    def _get_word_phonemes(self, word: str) -> List[str]:
        """Return the phonemes for one word.

        Uses the first pronunciation listed in the CMU dictionary when the
        word is present, otherwise the spelling-based estimate.
        """
        pronunciations = self.cmu_dict.get(word.lower())
        if not pronunciations:
            return self._estimate_phonemes(word)

        # Drop the stress digits (e.g. "IH1" -> "IH") before mapping.
        stressless = [re.sub(r"[0-9]", "", p) for p in pronunciations[0]]
        return self._convert_to_wav2vec_format(stressless)

    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
        """Map CMU symbols to IPA-style symbols.

        Symbols without a table entry are passed through lower-cased.
        """
        table = self._CMU_TO_ESPEAK
        return [table.get(phoneme, phoneme.lower()) for phoneme in cmu_phonemes]

    def _get_ipa(self, word: str) -> str:
        """Return the IPA transcription of ``word``.

        Falls back to ``/word/`` when the conversion fails for any reason.
        """
        try:
            return ipa.convert(word)
        except Exception:
            return f"/{word}/"

    def _estimate_phonemes(self, word: str) -> List[str]:
        """Approximate the phonemes of an unknown word from its spelling."""
        table = self._LETTER_TO_PHONEMES
        word = word.lower()
        length = len(word)
        phonemes = []
        i = 0

        while i < length:
            # Digraphs take priority over single letters.
            pair = word[i : i + 2]
            if len(pair) == 2 and pair in table:
                phonemes.extend(table[pair])
                i += 2
                continue

            # Single letter; unmapped characters are skipped silently.
            phonemes.extend(table.get(word[i], ()))
            i += 1

        return phonemes


class PhonemeComparator:
    """Score a learner's phoneme sequence against a reference sequence.

    Sequences are compared position by position (no re-alignment), so the
    index of each result equals the index of the reference phoneme.
    """

    def __init__(self):
        # Substitutions treated as "acceptable": reference phoneme -> learner
        # phonemes commonly produced instead by Vietnamese speakers.
        self.substitution_patterns = {
            "θ": ["f", "s", "t"],  # TH → F, S, T
            "ð": ["d", "z", "v"],  # DH → D, Z, V
            "v": ["w", "f"],  # V → W, F
            "r": ["l"],  # R → L
            "l": ["r"],  # L → R
            "z": ["s"],  # Z → S
            "ʒ": ["ʃ", "z"],  # ZH → SH, Z
            "ŋ": ["n"],  # NG → N
        }

        # Per-phoneme difficulty for Vietnamese speakers; phonemes not listed
        # here are reported with a difficulty of 0.3.
        self.difficulty_map = {
            "θ": 0.9,  # th (think)
            "ð": 0.9,  # th (this)
            "v": 0.8,  # v
            "z": 0.8,  # z
            "ʒ": 0.9,  # zh (measure)
            "r": 0.7,  # r
            "l": 0.6,  # l
            "w": 0.5,  # w
            "f": 0.4,  # f
            "s": 0.3,  # s
            "ʃ": 0.5,  # sh
            "tʃ": 0.4,  # ch
            "dʒ": 0.5,  # j
            "ŋ": 0.3,  # ng
        }

    def compare_phoneme_sequences(
        self, reference_phonemes: str, learner_phonemes: str
    ) -> List[Dict]:
        """Compare two space-separated phoneme strings position by position.

        Returns one dict per position with the keys ``position``,
        ``reference_phoneme``, ``learner_phoneme``, ``status``
        (correct / acceptable / wrong / missing / extra), ``score`` and
        ``difficulty``. A side that has run out of phonemes is reported as
        an empty string.
        """
        ref_phones = reference_phonemes.split()
        learner_phones = learner_phonemes.split()

        print(f"Reference phonemes: {ref_phones}")
        print(f"Learner phonemes: {learner_phones}")

        ref_count = len(ref_phones)
        learner_count = len(learner_phones)
        results = []

        for position in range(max(ref_count, learner_count)):
            expected = ref_phones[position] if position < ref_count else ""
            heard = learner_phones[position] if position < learner_count else ""

            # `split()` never yields empty tokens, so at least one side is
            # non-empty at every position inside the loop range.
            if not heard:
                status, score = "missing", 0.0
            elif not expected:
                status, score = "extra", 0.0
            elif expected == heard:
                status, score = "correct", 1.0
            elif self._is_acceptable_substitution(expected, heard):
                status, score = "acceptable", 0.7
            else:
                status, score = "wrong", 0.2

            results.append(
                {
                    "position": position,
                    "reference_phoneme": expected,
                    "learner_phoneme": heard,
                    "status": status,
                    "score": score,
                    "difficulty": self.difficulty_map.get(expected, 0.3),
                }
            )

        return results

    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
        """Return True when ``learner`` is a listed substitute for ``reference``."""
        return learner in self.substitution_patterns.get(reference, [])


# =============================================================================
# WORD ANALYZER
# =============================================================================


class WordAnalyzer:
    """Analyze word-level pronunciation accuracy using character-based ASR.

    Reference text is converted to phonemes, compared position by position
    with the learner's phonemes, and the per-phoneme scores are folded back
    onto the reference words.
    """

    # (minimum score, status, highlight color), highest band first. Scores
    # below every band are "poor" / red. Shared by status and color lookups
    # so the two can never drift apart.
    _SCORE_BANDS = (
        (0.8, "excellent", "#22c55e"),  # Green
        (0.6, "good", "#84cc16"),  # Light green
        (0.4, "needs_practice", "#eab308"),  # Yellow
    )
    _POOR_STATUS = "poor"
    _POOR_COLOR = "#ef4444"  # Red

    # Words scoring below this are reported as mispronounced.
    _WRONG_WORD_THRESHOLD = 0.6

    # Articulation hints (Vietnamese, user-facing) keyed by reference phoneme.
    _VIETNAMESE_TIPS = {
        "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
        "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
        "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
        "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
        "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
        "z": "Giống âm 's' nhưng có rung dây thanh âm",
        "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
        "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
    }

    def __init__(self):
        self.g2p = SimpleG2P()
        self.comparator = PhonemeComparator()

    def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
        """Analyze word-level pronunciation.

        Args:
            reference_text: The text the learner was asked to read.
            learner_phonemes: Space-separated phonemes produced by the learner.

        Returns:
            Dict with ``word_highlights`` (per-word scores), ``phoneme_differences``
            (per-position comparisons) and ``wrong_words`` (low-scoring words
            with their phoneme errors and tips).
        """
        # Reference phonemes grouped by word, and the same phonemes flattened.
        reference_words = self.g2p.text_to_phonemes(reference_text)
        reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)

        phoneme_comparisons = self.comparator.compare_phoneme_sequences(
            reference_phoneme_string, learner_phonemes
        )

        # Map phoneme results back onto words, then pick out the weak ones.
        word_highlights = self._create_word_highlights(
            reference_words, phoneme_comparisons
        )
        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)

        return {
            "word_highlights": word_highlights,
            "phoneme_differences": phoneme_comparisons,
            "wrong_words": wrong_words,
        }

    def _create_word_highlights(
        self, reference_words: List[Dict], phoneme_comparisons: List[Dict]
    ) -> List[Dict]:
        """Build per-word highlighting data.

        Each word consumes as many consecutive comparisons as it has
        reference phonemes; its score is the mean of those comparison scores
        (0.0 when no comparison is available for the word).
        """
        word_highlights = []
        phoneme_index = 0

        for word_data in reference_words:
            word_phonemes = word_data["phonemes"]
            num_phonemes = len(word_phonemes)

            # Slicing clamps at the end of the list, so words that run past
            # the available comparisons simply get fewer (or no) scores.
            window = phoneme_comparisons[phoneme_index : phoneme_index + num_phonemes]
            word_phoneme_scores = [comparison["score"] for comparison in window]

            word_score = (
                float(np.mean(word_phoneme_scores)) if word_phoneme_scores else 0.0
            )

            word_highlights.append(
                {
                    "word": word_data["word"],
                    "score": word_score,
                    "status": self._get_word_status(word_score),
                    "color": self._get_word_color(word_score),
                    "phonemes": word_phonemes,
                    "ipa": word_data["ipa"],
                    "phoneme_scores": word_phoneme_scores,
                    "phoneme_start_index": phoneme_index,
                    # Inclusive; equals start - 1 for a word with no phonemes.
                    "phoneme_end_index": phoneme_index + num_phonemes - 1,
                }
            )
            phoneme_index += num_phonemes

        return word_highlights

    def _identify_wrong_words(
        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
    ) -> List[Dict]:
        """Return details for every word scoring below the wrong-word threshold.

        For each such word the "wrong" and "missing" comparisons inside its
        phoneme range are collected, together with pronunciation tips.
        """
        wrong_words = []

        for word_highlight in word_highlights:
            if not word_highlight["score"] < self._WRONG_WORD_THRESHOLD:
                continue

            start_idx = word_highlight["phoneme_start_index"]
            end_idx = word_highlight["phoneme_end_index"]

            wrong_phonemes = []
            missing_phonemes = []

            for comparison in phoneme_comparisons[start_idx : end_idx + 1]:
                status = comparison["status"]
                if status == "wrong":
                    wrong_phonemes.append(
                        {
                            "expected": comparison["reference_phoneme"],
                            "actual": comparison["learner_phoneme"],
                            "difficulty": comparison["difficulty"],
                        }
                    )
                elif status == "missing":
                    missing_phonemes.append(
                        {
                            "phoneme": comparison["reference_phoneme"],
                            "difficulty": comparison["difficulty"],
                        }
                    )

            wrong_words.append(
                {
                    "word": word_highlight["word"],
                    "score": word_highlight["score"],
                    "expected_phonemes": word_highlight["phonemes"],
                    "ipa": word_highlight["ipa"],
                    "wrong_phonemes": wrong_phonemes,
                    "missing_phonemes": missing_phonemes,
                    "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes),
                }
            )

        return wrong_words

    def _get_word_status(self, score: float) -> str:
        """Return the status label for a word score."""
        for minimum, status, _color in self._SCORE_BANDS:
            if score >= minimum:
                return status
        return self._POOR_STATUS

    def _get_word_color(self, score: float) -> str:
        """Return the highlight color (hex string) for a word score."""
        for minimum, _status, color in self._SCORE_BANDS:
            if score >= minimum:
                return color
        return self._POOR_COLOR

    def _get_vietnamese_tips(
        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
    ) -> List[str]:
        """Return Vietnamese pronunciation tips for the given phoneme errors.

        Wrong phonemes always produce a tip (a specific hint when one exists,
        otherwise a generic "practice X instead of Y" line); missing phonemes
        produce a tip only when a specific hint exists. Identical tips are
        emitted once, in first-seen order, so a phoneme that is wrong several
        times in one word does not repeat the same advice.
        """
        hints = self._VIETNAMESE_TIPS
        tips = []

        for wrong in wrong_phonemes:
            expected = wrong["expected"]
            actual = wrong["actual"]
            if expected in hints:
                tips.append(f"Âm '{expected}': {hints[expected]}")
            else:
                tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")

        for missing in missing_phonemes:
            phoneme = missing["phoneme"]
            if phoneme in hints:
                tips.append(f"Thiếu âm '{phoneme}': {hints[phoneme]}")

        # dict.fromkeys keeps insertion order while dropping repeats.
        return list(dict.fromkeys(tips))


class SimpleFeedbackGenerator:
    """Generate simple, actionable feedback in Vietnamese."""

    # Detailed articulation guidance (Vietnamese, user-facing) keyed by the
    # reference phoneme. Only phonemes listed here get a focused tip.
    _DETAILED_PHONEME_TIPS = {
        "θ": "Đặt đầu lưỡi giữa 2 hàm răng, thổi nhẹ ra. Luyện: 'think', 'three', 'thank'.",
        "ð": "Như /θ/ nhưng rung dây thanh. Luyện: 'this', 'that', 'the'.",
        "v": "Răng trên chạm nhẹ môi dưới (không phải 2 môi). Luyện: 'very', 'have', 'love'.",
        "r": "Cuộn lưỡi lên nhưng KHÔNG chạm nóc miệng. Luyện: 'red', 'run', 'car'.",
        "l": "Đầu lưỡi chạm nướu răng trên. Luyện: 'love', 'like', 'tell'.",
        "z": "Như 's' nhưng rung dây thanh (đặt tay vào cổ để cảm nhận). Luyện: 'zoo', 'buzz'.",
        "ɛ": "Mở miệng vừa, lưỡi thấp (như 'e' trong 'ten'). Luyện: 'bed', 'red', 'get'.",
        "æ": "Mở miệng rộng, hàm dưới hạ thấp. Luyện: 'cat', 'man', 'bad'.",
        "ɪ": "Âm 'i' ngắn, lưỡi thả lỏng. Luyện: 'sit', 'big', 'this'.",
        "ʊ": "Âm 'u' ngắn, môi tròn nhẹ. Luyện: 'book', 'put', 'could'.",
    }

    @staticmethod
    def _percent(fraction: float) -> int:
        """Convert a 0..1 fraction to a whole percentage, rounding down.

        The product is rounded to 6 decimals before truncation so binary
        float noise does not lose a point (0.29 * 100 is 28.999999999999996,
        which plain ``int()`` would show as 28).
        """
        return int(round(fraction * 100, 6))

    def generate_feedback(
        self,
        overall_score: float,
        wrong_words: List[Dict],
        phoneme_comparisons: List[Dict],
    ) -> List[str]:
        """Generate focused Vietnamese feedback with actionable improvements.

        Args:
            overall_score: Overall pronunciation score in the range 0..1.
            wrong_words: Dicts with at least ``word`` and ``score`` keys.
            phoneme_comparisons: Dicts with at least ``status`` and
                ``reference_phoneme`` keys.

        Returns:
            Feedback lines in display order: overall verdict, word focus
            (if any wrong words), focused phoneme tip (if the most frequent
            problem phoneme has one), and next steps (if score < 0.8).
        """
        feedback = []
        score_pct = self._percent(overall_score)

        # Overall verdict by score range.
        if overall_score >= 0.8:
            feedback.append(f"Xuất sắc! Điểm: {score_pct}%. Tiếp tục duy trì và luyện tập thêm tốc độ tự nhiên.")
        elif overall_score >= 0.7:
            feedback.append(f"Tốt! Điểm: {score_pct}%. Để đạt 80%+, hãy tập trung vào nhịp điệu và ngữ điệu.")
        elif overall_score >= 0.6:
            feedback.append(f"Khá! Điểm: {score_pct}%. Để cải thiện, hãy phát âm chậm hơn và rõ ràng từng âm.")
        elif overall_score >= 0.4:
            feedback.append(f"Cần cải thiện. Điểm: {score_pct}%. Nghe lại mẫu và tập từng từ riêng lẻ trước.")
        else:
            feedback.append(f"Điểm: {score_pct}%. Hãy nghe mẫu 3-5 lần, sau đó tập phát âm từng từ chậm rãi.")

        # Word-level focus: point at the lowest-scoring word when the list is
        # short, otherwise give a general plan.
        if wrong_words:
            # min() returns the first lowest-scoring entry, matching a stable
            # ascending sort.
            worst_word = min(wrong_words, key=lambda entry: entry["score"])
            worst_text = worst_word["word"]
            worst_pct = self._percent(worst_word["score"])

            if len(wrong_words) == 1:
                feedback.append(f"Tập trung vào từ '{worst_text}' (điểm: {worst_pct}%). Click vào từ để nghe lại.")
            elif len(wrong_words) <= 3:
                feedback.append(f"Ưu tiên cải thiện: '{worst_text}' ({worst_pct}%) - các từ khác sẽ dễ hơn sau khi nắm được từ này.")
            else:
                feedback.append(f"Có {len(wrong_words)} từ cần cải thiện. Bắt đầu với 2 từ khó nhất và luyện tập 5 lần mỗi từ.")

        # Count how often each reference phoneme was wrong or missing.
        problem_phonemes = defaultdict(int)
        for comparison in phoneme_comparisons:
            if comparison["status"] in ("wrong", "missing"):
                problem_phonemes[comparison["reference_phoneme"]] += 1

        if problem_phonemes:
            # Most frequent problem phoneme; ties go to the first one seen.
            phoneme, count = max(problem_phonemes.items(), key=lambda item: item[1])
            if phoneme in self._DETAILED_PHONEME_TIPS:
                # Estimated gain is the share of all compared phonemes that
                # this problem accounts for (previously inverted as
                # 100 - share, which overstated rare errors).
                improvement = self._percent(count / len(phoneme_comparisons))
                feedback.append(
                    f"🎯 Tập trung âm /{phoneme}/: {self._DETAILED_PHONEME_TIPS[phoneme]} Cải thiện âm này sẽ tăng điểm ~{improvement}%."
                )

        # Next steps, by score range (nothing to add at 0.8 and above).
        if overall_score < 0.5:
            feedback.append("📚 Bước tiếp: 1) Nghe mẫu 5 lần, 2) Tập phát âm từng từ 3 lần, 3) Ghi âm lại và so sánh.")
        elif overall_score < 0.7:
            feedback.append("📚 Bước tiếp: 1) Tập từ khó nhất 5 lần, 2) Đọc cả câu chậm 2 lần, 3) Tăng tốc độ dần.")
        elif overall_score < 0.8:
            feedback.append("📚 Bước tiếp: 1) Luyện ngữ điệu tự nhiên, 2) Kết nối âm giữa các từ, 3) Tập nói với cảm xúc.")

        return feedback


def convert_numpy_types(obj):
    """Recursively convert numpy values to Python native types.

    numpy booleans, integers and floats become ``bool``/``int``/``float``,
    arrays become (nested) lists, and dicts, lists and tuples are rebuilt
    with their values converted. Dict keys and any other object are
    returned unchanged.
    """
    if isinstance(obj, np.bool_):
        # np.bool_ is not an np.integer, so it needs its own branch.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    return obj