from fastapi import UploadFile, File, Form, HTTPException, APIRouter
from pydantic import BaseModel
from typing import List, Dict, Optional
import tempfile
import numpy as np
import re
import warnings
from loguru import logger

from src.utils.speaking_utils import convert_numpy_types

# Import the new evaluation system
from evalution import ProductionPronunciationAssessor, EnhancedG2P

warnings.filterwarnings("ignore")

router = APIRouter(prefix="/speaking", tags=["Speaking"])


class PronunciationAssessmentResult(BaseModel):
    transcript: str  # What the user actually said (character transcript)
    transcript_phonemes: str  # User's phonemes
    user_phonemes: str  # Alias for transcript_phonemes for UI clarity
    user_ipa: Optional[str] = None  # User's IPA notation
    reference_ipa: str  # Reference IPA notation
    reference_phonemes: str  # Reference phonemes
    character_transcript: str
    overall_score: float
    word_highlights: List[Dict]
    phoneme_differences: List[Dict]
    wrong_words: List[Dict]
    feedback: List[str]
    processing_info: Dict
    # Enhanced features
    phoneme_pairs: Optional[List[Dict]] = None
    phoneme_comparison: Optional[Dict] = None
    prosody_analysis: Optional[Dict] = None
    assessment_mode: Optional[str] = None
    character_level_analysis: Optional[bool] = None


assessor = ProductionPronunciationAssessor()


async def assess_pronunciation(
    audio_file: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
    reference_text: str = Form(..., description="Reference text to pronounce"),
    mode: str = Form(
        "auto",
        description="Assessment mode: 'word', 'sentence', or 'auto' (determined by text length)",
    ),
):
    """
    Enhanced Pronunciation Assessment API with word/sentence mode support

    Key features:
    - Word mode: for single words or short phrases (1-3 words)
    - Sentence mode: for longer sentences with prosody analysis
    - Advanced phoneme comparison using Levenshtein distance
    - Prosody analysis (pitch, rhythm, intensity) for sentence mode
    - Detailed phoneme pair visualization
    - Vietnamese-optimized feedback and tips

    Input: Audio file + Reference text + Mode
    Output: Enhanced assessment results with visualization data
    """
    import time

    start_time = time.time()

    # Validate mode and fall back to "auto" if an unknown value is provided
    if mode not in ["word", "sentence", "auto"]:
        # Log the original (invalid) value before overwriting it
        logger.info(f"Invalid mode '{mode}' provided, defaulting to 'auto' mode")
        mode = "auto"  # Default to auto instead of raising an error

    # Validate inputs
    if not reference_text.strip():
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")

    if len(reference_text) > 500:
        raise HTTPException(
            status_code=400, detail="Reference text too long (max 500 characters)"
        )

    # Check for valid English characters
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        raise HTTPException(
            status_code=400,
            detail="Text must contain only English letters, spaces, and basic punctuation",
        )

    try:
        # Save uploaded file temporarily
        file_extension = ".wav"
        if audio_file.filename and "." in audio_file.filename:
            file_extension = f".{audio_file.filename.split('.')[-1]}"

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as tmp_file:
            content = await audio_file.read()
            tmp_file.write(content)
            tmp_file.flush()

            logger.info(f"Processing audio file: {tmp_file.name} with mode: {mode}")

            # Run assessment using the enhanced assessor
            result = assessor.assess_pronunciation(tmp_file.name, reference_text, mode)

            # Get reference phonemes and IPA
            g2p = EnhancedG2P()
            reference_words = reference_text.strip().split()
            reference_phonemes_list = []
            reference_ipa_list = []

            for word in reference_words:
                word_phonemes = g2p.text_to_phonemes(word.strip(".,!?;:"))[0]
                reference_phonemes_list.append(word_phonemes["phoneme_string"])
                reference_ipa_list.append(word_phonemes["ipa"])

            # Join phonemes and IPA for the full text
            result["reference_phonemes"] = " ".join(reference_phonemes_list)
            result["reference_ipa"] = " ".join(reference_ipa_list)

            # Create user_ipa from transcript using G2P (same way as reference)
            if "transcript" in result and result["transcript"]:
                try:
                    user_transcript = result["transcript"].strip()
                    user_words = user_transcript.split()
                    user_ipa_list = []

                    for word in user_words:
                        clean_word = word.strip(".,!?;:").lower()
                        if clean_word:  # Skip empty words
                            try:
                                word_phonemes = g2p.text_to_phonemes(clean_word)[0]
                                user_ipa_list.append(word_phonemes["ipa"])
                            except Exception as e:
                                logger.warning(
                                    f"Failed to get IPA for word '{clean_word}': {e}"
                                )
                                # Fallback: use the word itself
                                user_ipa_list.append(f"/{clean_word}/")

                    result["user_ipa"] = " ".join(user_ipa_list) if user_ipa_list else None
                    logger.info(
                        f"Generated user IPA from transcript '{user_transcript}': "
                        f"'{result['user_ipa']}'"
                    )
                except Exception as e:
                    logger.warning(f"Failed to generate user IPA from transcript: {e}")
                    result["user_ipa"] = None
            else:
                result["user_ipa"] = None

            # Add processing time
            processing_time = time.time() - start_time
            result["processing_info"]["processing_time"] = processing_time

            # Convert numpy types for JSON serialization
            final_result = convert_numpy_types(result)

            logger.info(
                f"Assessment completed in {processing_time:.2f} seconds using {mode} mode"
            )
            return PronunciationAssessmentResult(**final_result)

    except Exception as e:
        logger.error(f"Assessment error: {str(e)}")
        import traceback

        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}")


# =============================================================================
# UTILITY ENDPOINTS
# =============================================================================
def get_word_phonemes(word: str):
    """Get phoneme breakdown for a specific word"""
    try:
        # Use the new EnhancedG2P from the evaluation module
        from evalution import EnhancedG2P

        g2p = EnhancedG2P()
        phoneme_data = g2p.text_to_phonemes(word)[0]

        # Add difficulty analysis for Vietnamese speakers
        difficulty_scores = []
        for phoneme in phoneme_data["phonemes"]:
            difficulty = g2p.get_difficulty_score(phoneme)
            difficulty_scores.append(difficulty)

        avg_difficulty = float(np.mean(difficulty_scores)) if difficulty_scores else 0.3

        return {
            "word": word,
            "phonemes": phoneme_data["phonemes"],
            "phoneme_string": phoneme_data["phoneme_string"],
            "ipa": phoneme_data["ipa"],
            "difficulty_score": avg_difficulty,
            "difficulty_level": (
                "hard"
                if avg_difficulty > 0.6
                else "medium" if avg_difficulty > 0.4 else "easy"
            ),
            "challenging_phonemes": [
                {
                    "phoneme": p,
                    "difficulty": g2p.get_difficulty_score(p),
                    "vietnamese_tip": get_vietnamese_tip(p),
                }
                for p in phoneme_data["phonemes"]
                if g2p.get_difficulty_score(p) > 0.6
            ],
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Word analysis error: {str(e)}")
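

# Illustrative example (assumption: actual phoneme output depends on the
# EnhancedG2P model, so values are shown as placeholders):
#   get_word_phonemes("think")
#   -> {"word": "think", "phonemes": [...], "phoneme_string": "...", "ipa": "...",
#       "difficulty_score": ..., "difficulty_level": "hard" | "medium" | "easy",
#       "challenging_phonemes": [{"phoneme": "θ", "difficulty": ..., "vietnamese_tip": ...}]}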


def get_vietnamese_tip(phoneme: str) -> str:
    """Get Vietnamese pronunciation tip for a phoneme"""
    # Tips are intentionally written in Vietnamese for the target learners;
    # English glosses are given in the trailing comments.
    tips = {
        "θ": "Đặt lưỡi giữa răng, thổi nhẹ",  # Place the tongue between the teeth, blow gently
        "ð": "Giống θ nhưng rung dây thanh âm",  # Like θ but with the vocal cords vibrating
        "v": "Môi dưới chạm răng trên",  # Lower lip touches the upper teeth
        "r": "Cuộn lưỡi, không chạm vòm miệng",  # Curl the tongue without touching the palate
        "l": "Lưỡi chạm vòm miệng sau răng",  # Tongue touches the roof of the mouth behind the teeth
        "z": "Như 's' nhưng rung dây thanh",  # Like 's' but voiced
        "ʒ": "Như 'ʃ' nhưng rung dây thanh",  # Like 'ʃ' but voiced
        "w": "Tròn môi như 'u'",  # Round the lips as for 'u'
    }
    return tips.get(phoneme, f"Luyện âm {phoneme}")  # Fallback: "Practice the sound {phoneme}"
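

# =============================================================================
# Local usage sketch (assumption): this router is normally included by the
# project's main FastAPI application; the standalone app, host, and port below
# are illustrative only, for manual testing of the module.
# =============================================================================
if __name__ == "__main__":
    import uvicorn
    from fastapi import FastAPI

    app = FastAPI()
    # Any routes registered on `router` are served under the "/speaking" prefix
    app.include_router(router)

    uvicorn.run(app, host="0.0.0.0", port=8000)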