ABAO77 committed on
Commit
dd47219
·
1 Parent(s): 9c76eb3

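Rework the speaking route: replace the Whisper-based simplified pronunciation assessor with an enhanced multi-word phoneme processor (CMU Dict, eng_to_ipa, pronouncing) and move the router prefix from /pronunciation to /speaking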

Browse files
Files changed (1) hide show
  1. src/apis/routes/speaking_route.py +1381 -467
src/apis/routes/speaking_route.py CHANGED
@@ -1,16 +1,19 @@
1
- # SIMPLIFIED PRONUNCIATION ASSESSMENT API
2
- # Input: Audio + Reference Text Output: Word highlights + Phoneme diff + Wrong words
3
 
4
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from pydantic import BaseModel
7
- from typing import List, Dict, Optional
8
  import tempfile
9
  import os
10
  import numpy as np
 
11
  import nltk
12
  import eng_to_ipa as ipa
13
- import whisper
 
 
14
  import re
15
  from collections import defaultdict
16
  import warnings
@@ -20,6 +23,7 @@ warnings.filterwarnings("ignore")
20
  # Download required NLTK data
21
  try:
22
  nltk.download("cmudict", quiet=True)
 
23
  from nltk.corpus import cmudict
24
  except:
25
  print("Warning: NLTK data not available")
@@ -27,74 +31,147 @@ except:
27
  # =============================================================================
28
  # MODELS
29
  # =============================================================================
 
30
 
31
- router = APIRouter(prefix="/pronunciation", tags=["Pronunciation"])
32
 
33
-
34
- class PronunciationAssessmentResult(BaseModel):
35
- transcript: str
36
  overall_score: float
37
- word_highlights: List[Dict]
38
- phoneme_differences: List[Dict]
39
- wrong_words: List[Dict]
40
  feedback: List[str]
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  # =============================================================================
44
- # CORE COMPONENTS
45
  # =============================================================================
46
 
47
 
48
- class SimpleG2P:
49
- """Simple Grapheme-to-Phoneme converter"""
50
 
51
  def __init__(self):
 
 
 
52
  try:
53
  self.cmu_dict = cmudict.dict()
54
  except:
55
  self.cmu_dict = {}
56
  print("Warning: CMU dictionary not available")
57
 
58
- def text_to_phonemes(self, text: str) -> List[Dict]:
59
- """Convert text to phoneme sequence"""
60
- words = self._clean_text(text).split()
61
- phoneme_sequence = []
62
-
63
- for word in words:
64
- word_phonemes = self._get_word_phonemes(word)
65
- phoneme_sequence.append(
66
- {"word": word, "phonemes": word_phonemes, "ipa": self._get_ipa(word)}
67
- )
68
-
69
- return phoneme_sequence
70
 
71
- def _clean_text(self, text: str) -> str:
72
- """Clean text for processing"""
73
- text = re.sub(r"[^\w\s\']", " ", text)
74
- text = re.sub(r"\s+", " ", text)
75
- return text.lower().strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- def _get_word_phonemes(self, word: str) -> List[str]:
78
- """Get phonemes for a word"""
79
- word_lower = word.lower()
80
 
 
 
81
  if word_lower in self.cmu_dict:
82
- # Remove stress markers
83
- phonemes = self.cmu_dict[word_lower][0]
84
- return [re.sub(r"[0-9]", "", p) for p in phonemes]
85
- else:
86
- # Simple fallback
87
- return self._estimate_phonemes(word)
 
 
 
 
 
88
 
89
- def _get_ipa(self, word: str) -> str:
90
- """Get IPA transcription"""
91
  try:
92
- return ipa.convert(word)
 
 
 
 
 
 
 
 
 
93
  except:
94
- return f"/{word}/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  def _estimate_phonemes(self, word: str) -> List[str]:
97
  """Estimate phonemes for unknown words"""
 
98
  phoneme_map = {
99
  "ch": ["CH"],
100
  "sh": ["SH"],
@@ -136,7 +213,7 @@ class SimpleG2P:
136
 
137
  while i < len(word):
138
  # Check 2-letter combinations first
139
- if i <= len(word) - 2:
140
  two_char = word[i : i + 2]
141
  if two_char in phoneme_map:
142
  phonemes.extend(phoneme_map[two_char])
@@ -152,343 +229,921 @@ class SimpleG2P:
152
 
153
  return phonemes
154
 
155
-
156
- class SimplePhonemeComparator:
157
- """Simple phoneme comparison"""
158
-
159
- def __init__(self):
160
- # Vietnamese difficulty map
161
- self.difficulty_map = {
162
- "TH": 0.9,
163
- "DH": 0.9,
164
- "V": 0.8,
165
- "Z": 0.8,
166
- "ZH": 0.9,
167
- "R": 0.7,
168
- "L": 0.6,
169
- "W": 0.5,
170
- "F": 0.4,
171
- "S": 0.3,
172
- "SH": 0.5,
173
- "CH": 0.4,
174
- "JH": 0.5,
175
- "NG": 0.3,
176
  }
177
 
178
- # Common substitution patterns for Vietnamese speakers
179
- self.substitution_patterns = {
180
- "TH": ["F", "S", "T"],
181
- "DH": ["D", "Z", "V"],
182
- "V": ["W", "F"],
183
- "R": ["L"],
184
- "L": ["R"],
185
- "Z": ["S"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  }
187
 
188
- def compare_phonemes(
189
- self, reference_phonemes: List[Dict], learner_phonemes: List[Dict]
190
- ) -> List[Dict]:
191
- """Compare reference and learner phoneme sequences"""
192
-
193
- # Flatten phoneme sequences
194
- ref_sequence = []
195
- learner_sequence = []
196
-
197
- for word_data in reference_phonemes:
198
- for phoneme in word_data["phonemes"]:
199
- ref_sequence.append({"phoneme": phoneme, "word": word_data["word"]})
200
-
201
- for word_data in learner_phonemes:
202
- for phoneme in word_data["phonemes"]:
203
- learner_sequence.append({"phoneme": phoneme, "word": word_data["word"]})
204
-
205
- # Simple alignment and comparison
206
- comparisons = []
207
- max_len = max(len(ref_sequence), len(learner_sequence))
208
-
209
- for i in range(max_len):
210
- ref_item = ref_sequence[i] if i < len(ref_sequence) else None
211
- learner_item = learner_sequence[i] if i < len(learner_sequence) else None
212
-
213
- if ref_item and learner_item:
214
- ref_phoneme = ref_item["phoneme"]
215
- learner_phoneme = learner_item["phoneme"]
216
-
217
- if ref_phoneme == learner_phoneme:
218
- status = "correct"
219
- score = 1.0
220
- elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
221
- status = "acceptable"
222
- score = 0.7
223
- else:
224
- status = "wrong"
225
- score = 0.3
226
 
227
- comparisons.append(
228
- {
229
- "position": i,
230
- "reference_phoneme": ref_phoneme,
231
- "learner_phoneme": learner_phoneme,
232
- "status": status,
233
- "score": score,
234
- "word": ref_item["word"],
235
- "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
236
- }
237
- )
238
 
239
- elif ref_item and not learner_item:
240
- # Missing phoneme
241
- comparisons.append(
242
- {
243
- "position": i,
244
- "reference_phoneme": ref_item["phoneme"],
245
- "learner_phoneme": "",
246
- "status": "missing",
247
- "score": 0.0,
248
- "word": ref_item["word"],
249
- "difficulty": self.difficulty_map.get(ref_item["phoneme"], 0.3),
250
- }
251
- )
252
 
253
- elif learner_item and not ref_item:
254
- # Extra phoneme
255
- comparisons.append(
256
- {
257
- "position": i,
258
- "reference_phoneme": "",
259
- "learner_phoneme": learner_item["phoneme"],
260
- "status": "extra",
261
- "score": 0.0,
262
- "word": learner_item["word"],
263
- "difficulty": 0.3,
264
- }
265
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
- return comparisons
268
 
269
- def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
270
- """Check if substitution is acceptable for Vietnamese speakers"""
271
- acceptable = self.substitution_patterns.get(reference, [])
272
- return learner in acceptable
273
 
274
 
275
- class SimplePronunciationAssessor:
276
- """Simplified pronunciation assessor focused on core functionality"""
277
 
278
  def __init__(self):
279
- print("Initializing Whisper model...")
280
- self.whisper_model = whisper.load_model("base.en", in_memory=True)
281
- print("Whisper model loaded successfully")
282
-
283
- self.g2p = SimpleG2P()
284
- self.comparator = SimplePhonemeComparator()
285
  self.sample_rate = 16000
286
 
287
- def assess_pronunciation(self, audio_path: str, reference_text: str) -> Dict:
288
- """Main assessment function"""
 
 
 
 
 
289
 
290
- # Step 1: Whisper ASR
291
- print("Running Whisper transcription...")
292
- asr_result = self.whisper_model.transcribe(audio_path)
293
- transcript = asr_result["text"].strip()
294
- print(f"Transcript: '{transcript}'")
295
 
296
- # Step 2: Get reference phonemes
297
- print("Getting reference phonemes...")
298
- reference_phonemes = self.g2p.text_to_phonemes(reference_text)
299
 
300
- # Step 3: Get learner phonemes from transcript
301
- print("Getting learner phonemes...")
302
- learner_phonemes = self.g2p.text_to_phonemes(transcript)
303
 
304
- # Step 4: Compare phonemes
305
- print("Comparing phonemes...")
306
- phoneme_comparisons = self.comparator.compare_phonemes(
307
- reference_phonemes, learner_phonemes
308
  )
309
 
310
- # Step 5: Generate word highlights
311
- print("Generating word highlights...")
312
- word_highlights = self._generate_word_highlights(
313
- reference_phonemes, learner_phonemes, phoneme_comparisons
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- # Step 6: Identify wrong words
317
- print("Identifying wrong words...")
318
- wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
 
319
 
320
- # Step 7: Calculate overall score
321
- overall_score = self._calculate_overall_score(phoneme_comparisons)
 
 
322
 
323
- # Step 8: Generate feedback
324
- feedback = self._generate_simple_feedback(
325
- overall_score, wrong_words, phoneme_comparisons
326
  )
327
 
328
  return {
329
- "transcript": transcript,
330
  "overall_score": overall_score,
331
- "word_highlights": word_highlights,
332
- "phoneme_differences": phoneme_comparisons,
333
- "wrong_words": wrong_words,
334
  "feedback": feedback,
 
 
335
  }
336
 
337
- def _generate_word_highlights(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  self,
339
- reference_phonemes: List[Dict],
340
- learner_phonemes: List[Dict],
341
- phoneme_comparisons: List[Dict],
342
- ) -> List[Dict]:
343
- """Generate word highlighting data"""
344
-
345
- word_highlights = []
346
-
347
- # Group comparisons by word
348
- word_scores = defaultdict(list)
349
- for comparison in phoneme_comparisons:
350
- word = comparison.get("word", "unknown")
351
- if comparison["status"] in ["correct", "acceptable", "wrong"]:
352
- word_scores[word].append(comparison["score"])
353
-
354
- # Create highlights for reference words
355
- for word_data in reference_phonemes:
356
- word = word_data["word"]
357
- scores = word_scores.get(word, [0.0])
358
- avg_score = float(np.mean(scores))
359
-
360
- highlight = {
361
- "word": word,
362
- "score": avg_score,
363
- "status": self._get_word_status(avg_score),
364
- "color": self._get_word_color(avg_score),
365
- "phonemes": word_data["phonemes"],
366
- "ipa": word_data["ipa"],
367
- "issues": self._get_word_issues(word, phoneme_comparisons),
368
  }
369
 
370
- word_highlights.append(highlight)
371
-
372
- return word_highlights
373
-
374
- def _identify_wrong_words(
375
- self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
376
- ) -> List[Dict]:
377
- """Identify words that were pronounced incorrectly"""
378
-
379
- wrong_words = []
380
-
381
- for word_highlight in word_highlights:
382
- if word_highlight["score"] < 0.6: # Threshold for "wrong"
383
- word = word_highlight["word"]
384
-
385
- # Find specific issues for this word
386
- word_issues = []
387
- wrong_phonemes = []
388
- missing_phonemes = []
389
-
390
- for comparison in phoneme_comparisons:
391
- if comparison.get("word") == word:
392
- if comparison["status"] == "wrong":
393
- wrong_phonemes.append(
394
- {
395
- "expected": comparison["reference_phoneme"],
396
- "actual": comparison["learner_phoneme"],
397
- }
398
- )
399
- elif comparison["status"] == "missing":
400
- missing_phonemes.append(comparison["reference_phoneme"])
401
-
402
- if wrong_phonemes:
403
- word_issues.append(
404
- f"Wrong sounds: {', '.join([p['expected'] for p in wrong_phonemes])}"
405
- )
406
 
407
- if missing_phonemes:
408
- word_issues.append(f"Missing sounds: {', '.join(missing_phonemes)}")
409
-
410
- wrong_word = {
411
- "word": word,
412
- "score": word_highlight["score"],
413
- "expected_phonemes": word_highlight["phonemes"],
414
- "ipa": word_highlight["ipa"],
415
- "issues": word_issues,
416
- "wrong_phonemes": wrong_phonemes,
417
- "missing_phonemes": missing_phonemes,
418
- "tips": self._get_pronunciation_tips(
419
- word, wrong_phonemes, missing_phonemes
420
  ),
 
 
 
421
  }
422
 
423
- wrong_words.append(wrong_word)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
- return wrong_words
426
 
427
- def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
428
- """Calculate overall pronunciation score"""
429
- if not phoneme_comparisons:
430
- return 0.0
431
 
432
- total_score = 0.0
433
- for comparison in phoneme_comparisons:
434
- total_score += comparison["score"]
435
 
436
- return total_score / len(phoneme_comparisons)
 
 
437
 
438
- def _generate_simple_feedback(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  self,
 
 
 
440
  overall_score: float,
441
- wrong_words: List[Dict],
442
- phoneme_comparisons: List[Dict],
443
  ) -> List[str]:
444
- """Generate simple, actionable feedback"""
445
-
446
  feedback = []
447
 
448
- # Overall feedback
449
- if overall_score >= 0.8:
450
- feedback.append("Phát âm tốt! Bạn đã làm rất tốt.")
451
- elif overall_score >= 0.6:
452
- feedback.append("Phát âm khá tốt, còn một vài điểm cần cải thiện.")
453
- elif overall_score >= 0.4:
 
 
 
 
 
 
 
 
454
  feedback.append(
455
- "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
456
  )
457
  else:
458
- feedback.append("Hãy luyện tập chậm và rõ ràng hơn.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
- # Wrong words feedback
461
- if wrong_words:
462
- word_names = [w["word"] for w in wrong_words[:3]]
463
- feedback.append(f"Các từ cần luyện tập: {', '.join(word_names)}")
 
464
 
465
  # Phoneme-specific feedback for Vietnamese speakers
466
- problem_phonemes = defaultdict(int)
467
- for comparison in phoneme_comparisons:
468
- if comparison["status"] == "wrong":
469
- phoneme = comparison["reference_phoneme"]
470
- problem_phonemes[phoneme] += 1
471
-
472
- # Vietnamese-specific tips for most problematic sounds
473
- vietnamese_tips = {
474
- "TH": "Đặt lưỡi giữa răng, thổi nhẹ",
475
- "DH": "Giống TH nhưng rung dây thanh",
476
- "V": "Chạm môi dưới vào răng trên",
477
- "R": "Cuộn lưỡi, không chạm vòm miệng",
478
- "L": "Đầu lưỡi chạm vòm miệng",
479
- "Z": "Giống S nhưng rung dây thanh",
480
- }
 
481
 
482
- if problem_phonemes:
483
- most_difficult = sorted(
484
- problem_phonemes.items(), key=lambda x: x[1], reverse=True
 
 
 
 
 
 
 
 
 
485
  )
486
- for phoneme, count in most_difficult[:2]:
487
- if phoneme in vietnamese_tips:
488
- feedback.append(f"Âm {phoneme}: {vietnamese_tips[phoneme]}")
489
 
490
  return feedback
491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  def _get_word_status(self, score: float) -> str:
493
  """Get word status from score"""
494
  if score >= 0.8:
@@ -500,215 +1155,474 @@ class SimplePronunciationAssessor:
500
  else:
501
  return "poor"
502
 
503
- def _get_word_color(self, score: float) -> str:
504
- """Get color for word highlighting"""
505
- if score >= 0.8:
506
- return "#22c55e" # Green
507
- elif score >= 0.6:
508
- return "#84cc16" # Light green
509
- elif score >= 0.4:
510
- return "#eab308" # Yellow
511
- else:
512
- return "#ef4444" # Red
513
-
514
- def _get_word_issues(self, word: str, phoneme_comparisons: List[Dict]) -> List[str]:
515
- """Get specific issues for a word"""
516
- issues = []
517
-
518
- word_comparisons = [c for c in phoneme_comparisons if c.get("word") == word]
519
-
520
- wrong_count = len([c for c in word_comparisons if c["status"] == "wrong"])
521
- missing_count = len([c for c in word_comparisons if c["status"] == "missing"])
522
-
523
- if wrong_count > 0:
524
- issues.append(f"{wrong_count} sai âm")
525
- if missing_count > 0:
526
- issues.append(f"{missing_count} thiếu âm")
527
-
528
- return issues
529
-
530
- def _get_pronunciation_tips(
531
- self, word: str, wrong_phonemes: List[Dict], missing_phonemes: List[str]
532
- ) -> List[str]:
533
- """Get pronunciation tips for wrong words"""
534
- tips = []
535
-
536
- # Tips for specific problematic phonemes
537
- phoneme_tips = {
538
- "TH": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ",
539
- "DH": "Giống TH nhưng rung dây thanh âm",
540
- "V": "Chạm môi dưới vào răng trên, không dùng cả hai môi",
541
- "R": "Cuộn lưỡi nhưng không chạm vào vòm miệng",
542
- "L": "Đầu lưỡi chạm vào vòm miệng sau răng",
543
- "Z": "Giống âm S nhưng có rung dây thanh âm",
544
- }
545
-
546
- # Add tips for wrong phonemes
547
- for wrong in wrong_phonemes:
548
- expected = wrong["expected"]
549
- if expected in phoneme_tips:
550
- tips.append(f"Âm {expected}: {phoneme_tips[expected]}")
551
-
552
- # Add tips for missing phonemes
553
- for missing in missing_phonemes:
554
- if missing in phoneme_tips:
555
- tips.append(f"Thiếu âm {missing}: {phoneme_tips[missing]}")
556
-
557
- # General tip if no specific tips
558
- if not tips:
559
- tips.append(f"Luyện tập từ '{word}' chậm và rõ ràng")
560
-
561
- return tips
562
 
563
 
564
  # =============================================================================
565
- # MAIN API ENDPOINT
566
  # =============================================================================
567
 
568
- # Initialize assessor
569
- assessor = SimplePronunciationAssessor()
570
 
 
 
571
 
572
- def convert_numpy_types(obj):
573
- """Convert numpy types to Python native types"""
574
- if isinstance(obj, np.integer):
575
- return int(obj)
576
- elif isinstance(obj, np.floating):
577
- return float(obj)
578
- elif isinstance(obj, np.ndarray):
579
- return obj.tolist()
580
- elif isinstance(obj, dict):
581
- return {key: convert_numpy_types(value) for key, value in obj.items()}
582
- elif isinstance(obj, list):
583
- return [convert_numpy_types(item) for item in obj]
584
- else:
585
- return obj
586
 
587
 
588
- @router.post("/assess", response_model=PronunciationAssessmentResult)
589
  async def assess_pronunciation(
590
- audio: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
591
- reference_text: str = Form(..., description="Reference text to compare against"),
 
592
  ):
593
  """
594
- Main API: Pronunciation Assessment
595
-
596
- Input: Audio file + Reference text
597
- Output: Word highlights + Phoneme differences + Wrong words
598
-
599
- Features:
600
- - Whisper ASR for transcript
601
- - CMU Dict phoneme mapping
602
- - Vietnamese-optimized comparison
603
- - Simple UI-ready output
604
  """
605
 
606
  import time
607
 
608
  start_time = time.time()
609
-
 
 
 
 
610
  # Validate inputs
611
  if not reference_text.strip():
 
612
  raise HTTPException(status_code=400, detail="Reference text cannot be empty")
613
 
614
- if len(reference_text) > 500:
 
615
  raise HTTPException(
616
- status_code=400, detail="Reference text too long (max 500 characters)"
617
  )
618
 
619
- # Check for valid English characters
 
620
  if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
 
 
621
  raise HTTPException(
622
  status_code=400,
623
- detail="Text must contain only English letters, spaces, and basic punctuation",
624
  )
625
 
626
  try:
627
- # Save uploaded file temporarily
 
 
628
  file_extension = ".wav"
629
- if audio.filename and "." in audio.filename:
630
- file_extension = f".{audio.filename.split('.')[-1]}"
631
-
632
  with tempfile.NamedTemporaryFile(
633
  delete=False, suffix=file_extension
634
  ) as tmp_file:
635
  content = await audio.read()
636
  tmp_file.write(content)
637
  tmp_file.flush()
 
 
638
 
639
- print(f"Processing audio file: {tmp_file.name}")
 
 
 
640
 
641
- # Run assessment
642
- result = assessor.assess_pronunciation(tmp_file.name, reference_text)
643
-
644
- # Clean up temporary file
645
  os.unlink(tmp_file.name)
646
 
647
- # Convert numpy types for JSON serialization
648
- final_result = convert_numpy_types(result)
 
 
 
 
 
 
 
 
649
 
650
  processing_time = time.time() - start_time
651
- print(f"Assessment completed in {processing_time:.2f} seconds")
652
-
653
- return PronunciationAssessmentResult(**final_result)
 
 
 
 
 
 
 
 
 
654
 
655
  except Exception as e:
656
- print(f"Assessment error: {str(e)}")
657
  import traceback
658
-
659
  traceback.print_exc()
660
- raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}")
661
-
662
-
663
- # =============================================================================
664
- # UTILITY ENDPOINTS
665
- # =============================================================================
666
 
667
 
668
  @router.get("/phonemes/{word}")
669
  async def get_word_phonemes(word: str):
670
- """Get phoneme breakdown for a specific word"""
671
  try:
672
- phoneme_data = assessor.g2p.text_to_phonemes(word)[0]
673
 
674
- # Add difficulty analysis
675
- difficulty_scores = []
676
- for phoneme in phoneme_data["phonemes"]:
677
- difficulty = assessor.comparator.difficulty_map.get(phoneme, 0.3)
678
- difficulty_scores.append(difficulty)
679
 
680
- avg_difficulty = float(np.mean(difficulty_scores)) if difficulty_scores else 0.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
 
682
  return {
683
  "word": word,
684
- "phonemes": phoneme_data["phonemes"],
685
- "ipa": phoneme_data["ipa"],
686
- "difficulty_score": avg_difficulty,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
687
  "difficulty_level": (
688
  "hard"
689
- if avg_difficulty > 0.6
690
- else "medium" if avg_difficulty > 0.4 else "easy"
691
  ),
692
- "challenging_phonemes": [
693
- {
694
- "phoneme": p,
695
- "difficulty": assessor.comparator.difficulty_map.get(p, 0.3),
696
- }
697
- for p in phoneme_data["phonemes"]
698
- if assessor.comparator.difficulty_map.get(p, 0.3) > 0.6
699
- ],
700
  }
701
 
702
  except Exception as e:
703
- raise HTTPException(status_code=500, detail=f"Word analysis error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
706
- @router.get("/health")
707
- async def health_check():
708
- """Simple health check endpoint"""
709
  return {
710
- "status": "healthy",
711
- "whisper_model": "tiny",
712
- "cmu_dict_size": len(assessor.g2p.cmu_dict),
713
- "vietnamese_optimized": True,
714
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ENHANCED PRONUNCIATION API - MULTI-WORD SUPPORT
2
+ # Supports any English word using CMU Dict + phoneme libraries
3
 
4
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from pydantic import BaseModel
7
+ from typing import List, Dict, Optional, Tuple
8
  import tempfile
9
  import os
10
  import numpy as np
11
+ import librosa
12
  import nltk
13
  import eng_to_ipa as ipa
14
+ import pronouncing
15
+ import requests
16
+ import json
17
  import re
18
  from collections import defaultdict
19
  import warnings
 
23
  # Download required NLTK data
24
try:
    # Fetch the corpora this module relies on, then import the CMU reader.
    nltk.download("cmudict", quiet=True)
    nltk.download("punkt", quiet=True)
    from nltk.corpus import cmudict
except Exception:
    # Best-effort: keep the module importable when the download or import
    # fails.  `except Exception` (rather than a bare `except:`) lets
    # KeyboardInterrupt / SystemExit propagate instead of being swallowed.
    print("Warning: NLTK data not available")
 
31
  # =============================================================================
32
  # MODELS
33
  # =============================================================================
34
+ router = APIRouter(prefix="/speaking", tags=["AI"])
35
 
 
36
 
37
+ class PronunciationResult(BaseModel):
 
 
38
  overall_score: float
39
+ status: str
 
 
40
  feedback: List[str]
41
+ words: List[Dict]
42
+ phoneme_details: List[Dict]
43
+ audio_info: Dict
44
+ processing_time: float
45
+ difficulty_analysis: Dict
46
+
47
+
48
+ class WordPhonemeInfo(BaseModel):
49
+ word: str
50
+ phonemes: List[str]
51
+ ipa_transcription: str
52
+ syllables: List[str]
53
+ stress_pattern: List[int]
54
 
55
 
56
  # =============================================================================
57
+ # ENHANCED PHONEME PROCESSOR
58
  # =============================================================================
59
 
60
 
61
+ class EnhancedPhonemeProcessor:
62
+ """Advanced phoneme processing with multiple dictionaries"""
63
 
64
def __init__(self):
    """Initialise the processor's lookup tables.

    Sets ``sample_rate``, loads the CMU pronouncing dictionary into
    ``cmu_dict`` (falling back to an empty dict with a printed warning),
    builds ``phoneme_models`` via ``_load_comprehensive_phoneme_models`` and
    defines ``difficulty_map``, the per-phoneme difficulty weights for
    Vietnamese speakers.
    """
    self.sample_rate = 16000  # presumably Hz -- unit is not stated in this file

    # Load CMU dictionary
    try:
        self.cmu_dict = cmudict.dict()
    except Exception:
        # Best-effort fallback: covers a missing corpus as well as `cmudict`
        # never having been imported.  Not a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        self.cmu_dict = {}
        print("Warning: CMU dictionary not available")

    # Load comprehensive phoneme acoustic models
    self.phoneme_models = self._load_comprehensive_phoneme_models()

    # Phoneme difficulty for Vietnamese speakers (higher value = harder)
    self.difficulty_map = {
        # Very difficult for Vietnamese
        "TH": 0.9,  # think, that
        "DH": 0.9,  # this, then
        "V": 0.8,  # very, love
        "Z": 0.8,  # zoo, rise
        "ZH": 0.9,  # measure, vision
        "R": 0.7,  # red, car
        "L": 0.6,  # love, well
        "W": 0.5,  # water, well
        # Moderately difficult
        "F": 0.4,  # fish, life
        "S": 0.3,  # see, this
        "SH": 0.5,  # shoe, fish
        "CH": 0.4,  # chair, much
        "JH": 0.5,  # job, bridge
        # Vowels - challenging distinctions
        "IY": 0.3,  # beat
        "IH": 0.6,  # bit
        "EY": 0.4,  # bait
        "EH": 0.5,  # bet
        "AE": 0.7,  # bat
        "AH": 0.4,  # but
        "AO": 0.6,  # bought
        "OW": 0.4,  # boat
        "UH": 0.6,  # book
        "UW": 0.4,  # boot
        # Easier sounds
        "P": 0.2,
        "B": 0.2,
        "T": 0.2,
        "D": 0.2,
        "K": 0.2,
        "G": 0.2,
        "M": 0.2,
        "N": 0.2,
        "NG": 0.3,
    }
116
 
117
def get_word_phonemes(self, word: str) -> WordPhonemeInfo:
    """Collect phoneme, IPA, syllable and stress information for one word.

    Args:
        word: The word to analyse.  Dictionary lookups use the lower-cased,
            stripped form; the returned ``word`` and ``syllables`` keep the
            text exactly as passed in.

    Returns:
        A ``WordPhonemeInfo`` holding:
        * ``phonemes`` - first CMU pronunciation with stress digits removed,
          or the output of ``_estimate_phonemes`` when CMU has no entry;
        * ``ipa_transcription`` - ``eng_to_ipa`` output, or ``/word/`` when
          the conversion raises;
        * ``syllables`` - an even orthographic split into as many chunks as
          the word has syllables (an approximation, not a linguistic
          syllabification), or ``[word]`` when the count is unknown;
        * ``stress_pattern`` - one CMU stress digit per stressed-marked phone.
    """
    word_lower = word.lower().strip()

    # Method 1: CMU dictionary (first pronunciation variant only).
    variants = self.cmu_dict.get(word_lower)
    raw_phones = variants[0] if variants else []
    cmu_phonemes = [re.sub(r"[0-9]", "", phone) for phone in raw_phones]

    # Stress digits (0/1/2) are attached to vowel phones in CMU notation.
    stress_pattern = []
    for phone in raw_phones:
        digits = re.findall(r"[0-9]", phone)
        if digits:
            stress_pattern.append(int(digits[0]))

    # Method 2: eng_to_ipa library.
    try:
        ipa_transcription = ipa.convert(word)
    except Exception:
        ipa_transcription = f"/{word}/"

    # Method 3: syllable count.  `pronouncing.syllable_count` counts stress
    # digits in a *phones string*; handing it the raw spelling always gives 0.
    # Use the CMU stress digits found above, and only fall back to
    # `pronouncing` (with a proper phones string) when CMU had no entry.
    syllable_count = len(stress_pattern)
    if not syllable_count:
        try:
            phones_list = pronouncing.phones_for_word(word_lower)
            if phones_list:
                syllable_count = pronouncing.syllable_count(phones_list[0])
        except Exception:
            syllable_count = 0

    # Simple syllable division: spread the letters over exactly
    # `syllable_count` chunks (earlier chunks absorb the remainder).
    if syllable_count and len(word) > syllable_count:
        base, extra = divmod(len(word), syllable_count)
        syllables = []
        start = 0
        for index in range(syllable_count):
            end = start + base + (1 if index < extra else 0)
            syllables.append(word[start:end])
            start = end
    else:
        syllables = [word]

    # Fallback phonemes if CMU not available.
    if not cmu_phonemes:
        cmu_phonemes = self._estimate_phonemes(word)

    return WordPhonemeInfo(
        word=word,
        phonemes=cmu_phonemes,
        ipa_transcription=ipa_transcription,
        syllables=syllables,
        stress_pattern=stress_pattern,
    )
171
 
172
  def _estimate_phonemes(self, word: str) -> List[str]:
173
  """Estimate phonemes for unknown words"""
174
+ # Simple grapheme-to-phoneme mapping
175
  phoneme_map = {
176
  "ch": ["CH"],
177
  "sh": ["SH"],
 
213
 
214
  while i < len(word):
215
  # Check 2-letter combinations first
216
+ if i < len(word) - 1:
217
  two_char = word[i : i + 2]
218
  if two_char in phoneme_map:
219
  phonemes.extend(phoneme_map[two_char])
 
229
 
230
  return phonemes
231
 
232
+ def _load_comprehensive_phoneme_models(self) -> Dict:
233
+ """Load comprehensive phoneme acoustic models"""
234
+ # Extended phoneme set với acoustic characteristics
235
+ models = {}
236
+
237
+ # VOWELS
238
+ vowel_models = {
239
+ "IY": {"f1": 270, "f2": 2300, "duration": 150, "type": "vowel"}, # beat
240
+ "IH": {"f1": 390, "f2": 1990, "duration": 120, "type": "vowel"}, # bit
241
+ "EY": {"f1": 400, "f2": 2100, "duration": 160, "type": "vowel"}, # bait
242
+ "EH": {"f1": 550, "f2": 1770, "duration": 130, "type": "vowel"}, # bet
243
+ "AE": {"f1": 690, "f2": 1660, "duration": 140, "type": "vowel"}, # bat
244
+ "AH": {"f1": 640, "f2": 1190, "duration": 110, "type": "vowel"}, # but
245
+ "AO": {"f1": 570, "f2": 840, "duration": 150, "type": "vowel"}, # bought
246
+ "OW": {"f1": 430, "f2": 1020, "duration": 160, "type": "vowel"}, # boat
247
+ "UH": {"f1": 450, "f2": 1030, "duration": 120, "type": "vowel"}, # book
248
+ "UW": {"f1": 310, "f2": 870, "duration": 150, "type": "vowel"}, # boot
249
+ "ER": {"f1": 490, "f2": 1350, "duration": 140, "type": "vowel"}, # bird
250
+ "AY": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"}, # bite
251
+ "AW": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"}, # bout
252
+ "OY": {"f1": 570, "f2": 840, "duration": 180, "type": "vowel"}, # boy
253
  }
254
 
255
+ # CONSONANTS
256
+ consonant_models = {
257
+ # Stops
258
+ "P": {
259
+ "burst_energy": 0.8,
260
+ "duration": 80,
261
+ "type": "stop",
262
+ "voicing": False,
263
+ },
264
+ "B": {"burst_energy": 0.7, "duration": 85, "type": "stop", "voicing": True},
265
+ "T": {
266
+ "burst_energy": 0.9,
267
+ "duration": 75,
268
+ "type": "stop",
269
+ "voicing": False,
270
+ },
271
+ "D": {
272
+ "burst_energy": 0.75,
273
+ "duration": 80,
274
+ "type": "stop",
275
+ "voicing": True,
276
+ },
277
+ "K": {
278
+ "burst_energy": 0.85,
279
+ "duration": 70,
280
+ "type": "stop",
281
+ "voicing": False,
282
+ },
283
+ "G": {"burst_energy": 0.7, "duration": 75, "type": "stop", "voicing": True},
284
+ # Fricatives (challenging for Vietnamese)
285
+ "F": {
286
+ "high_freq": True,
287
+ "duration": 120,
288
+ "type": "fricative",
289
+ "voicing": False,
290
+ },
291
+ "V": {
292
+ "high_freq": True,
293
+ "duration": 110,
294
+ "type": "fricative",
295
+ "voicing": True,
296
+ },
297
+ "TH": {
298
+ "high_freq": True,
299
+ "duration": 130,
300
+ "type": "fricative",
301
+ "voicing": False,
302
+ }, # think
303
+ "DH": {
304
+ "high_freq": True,
305
+ "duration": 120,
306
+ "type": "fricative",
307
+ "voicing": True,
308
+ }, # this
309
+ "S": {
310
+ "very_high_freq": True,
311
+ "duration": 140,
312
+ "type": "fricative",
313
+ "voicing": False,
314
+ },
315
+ "Z": {
316
+ "very_high_freq": True,
317
+ "duration": 130,
318
+ "type": "fricative",
319
+ "voicing": True,
320
+ },
321
+ "SH": {
322
+ "high_freq": True,
323
+ "duration": 150,
324
+ "type": "fricative",
325
+ "voicing": False,
326
+ }, # shoe
327
+ "ZH": {
328
+ "high_freq": True,
329
+ "duration": 140,
330
+ "type": "fricative",
331
+ "voicing": True,
332
+ }, # measure
333
+ "HH": {
334
+ "breathy": True,
335
+ "duration": 100,
336
+ "type": "fricative",
337
+ "voicing": False,
338
+ }, # hello
339
+ # Affricates
340
+ "CH": {
341
+ "burst_fricative": True,
342
+ "duration": 160,
343
+ "type": "affricate",
344
+ "voicing": False,
345
+ }, # chair
346
+ "JH": {
347
+ "burst_fricative": True,
348
+ "duration": 150,
349
+ "type": "affricate",
350
+ "voicing": True,
351
+ }, # job
352
+ # Nasals
353
+ "M": {"nasal": True, "duration": 100, "type": "nasal", "voicing": True},
354
+ "N": {"nasal": True, "duration": 95, "type": "nasal", "voicing": True},
355
+ "NG": {
356
+ "nasal": True,
357
+ "duration": 105,
358
+ "type": "nasal",
359
+ "voicing": True,
360
+ }, # ring
361
+ # Liquids (challenging L/R distinction)
362
+ "L": {"lateral": True, "duration": 90, "type": "liquid", "voicing": True},
363
+ "R": {"retroflex": True, "duration": 95, "type": "liquid", "voicing": True},
364
+ # Glides
365
+ "Y": {"glide": True, "duration": 70, "type": "glide", "voicing": True},
366
+ "W": {"glide": True, "duration": 75, "type": "glide", "voicing": True},
367
  }
368
 
369
+ # Combine models
370
+ models.update(vowel_models)
371
+ models.update(consonant_models)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
+ return models
 
 
 
 
 
 
 
 
 
 
374
 
375
+ def get_difficulty_score(self, phonemes: List[str]) -> float:
376
+ """Calculate difficulty score for Vietnamese speakers"""
377
+ if not phonemes:
378
+ return 0.5
 
 
 
 
 
 
 
 
 
379
 
380
+ difficulties = []
381
+ for phoneme in phonemes:
382
+ # Remove stress markers
383
+ clean_phoneme = re.sub(r"[0-9]", "", phoneme)
384
+ difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
385
+ difficulties.append(difficulty)
386
+
387
+ return np.mean(difficulties)
388
+
389
+ def score_phoneme_advanced(
390
+ self, phoneme: str, segment_features: Dict, context: Dict = None
391
+ ) -> float:
392
+ """Advanced phoneme scoring với context"""
393
+ clean_phoneme = re.sub(r"[0-9]", "", phoneme)
394
+
395
+ if clean_phoneme not in self.phoneme_models:
396
+ return 0.5 # Unknown phoneme
397
+
398
+ model = self.phoneme_models[clean_phoneme]
399
+ score = 0.0
400
+
401
+ # Type-specific scoring
402
+ if model["type"] == "vowel":
403
+ score = self._score_vowel(clean_phoneme, segment_features, model)
404
+ elif model["type"] == "fricative":
405
+ score = self._score_fricative(clean_phoneme, segment_features, model)
406
+ elif model["type"] == "stop":
407
+ score = self._score_stop(clean_phoneme, segment_features, model)
408
+ elif model["type"] in ["liquid", "nasal", "glide", "affricate"]:
409
+ score = self._score_other_consonant(clean_phoneme, segment_features, model)
410
+
411
+ # Context adjustments
412
+ if context:
413
+ score = self._apply_context_adjustments(score, clean_phoneme, context)
414
+
415
+ # Difficulty adjustment for Vietnamese speakers
416
+ difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
417
+ # Easier scoring for more difficult phonemes
418
+ adjusted_score = score + (difficulty * 0.1)
419
+
420
+ return np.clip(adjusted_score, 0, 1)
421
+
422
+ def _score_vowel(self, phoneme: str, features: Dict, model: Dict) -> float:
423
+ """Score vowel phoneme"""
424
+ score = 0.0
425
+
426
+ # Energy check (vowels should have good energy)
427
+ if features.get("rms_mean", 0) > 0.01:
428
+ score += 0.3
429
+
430
+ # Spectral characteristics
431
+ centroid = features.get("spectral_centroid_mean", 0)
432
+ target_f2 = model.get("f2", 1500)
433
+
434
+ # F2 approximation from spectral centroid
435
+ f2_error = abs(centroid - target_f2) / target_f2
436
+ f2_score = max(0, 1 - f2_error)
437
+ score += 0.4 * f2_score
438
+
439
+ # Stability (vowels should be stable)
440
+ zcr = features.get("zcr_mean", 0)
441
+ if zcr < 0.1: # Low zero crossing for vowels
442
+ score += 0.3
443
+
444
+ return score
445
+
446
+ def _score_fricative(self, phoneme: str, features: Dict, model: Dict) -> float:
447
+ """Score fricative phoneme"""
448
+ score = 0.0
449
+
450
+ # High frequency content for fricatives
451
+ centroid = features.get("spectral_centroid_mean", 0)
452
+ zcr = features.get("zcr_mean", 0)
453
+
454
+ if model.get("very_high_freq"): # S, Z sounds
455
+ if centroid > 3000:
456
+ score += 0.4
457
+ if zcr > 0.2:
458
+ score += 0.4
459
+ elif model.get("high_freq"): # F, V, TH, DH, SH, ZH
460
+ if centroid > 1500:
461
+ score += 0.4
462
+ if zcr > 0.15:
463
+ score += 0.3
464
+
465
+ # Voicing check
466
+ energy = features.get("rms_mean", 0)
467
+ if model.get("voicing") and energy > 0.01: # Voiced fricatives
468
+ score += 0.2
469
+ elif not model.get("voicing") and energy < 0.05: # Voiceless fricatives
470
+ score += 0.2
471
+
472
+ return score
473
+
474
+ def _score_stop(self, phoneme: str, features: Dict, model: Dict) -> float:
475
+ """Score stop consonant"""
476
+ score = 0.0
477
+
478
+ # Burst energy
479
+ energy = features.get("rms_mean", 0)
480
+ burst_threshold = 0.02 if model.get("voicing") else 0.03
481
+
482
+ if energy > burst_threshold:
483
+ score += 0.6
484
+
485
+ # Duration check
486
+ # Stops should be relatively short
487
+ score += 0.4 # Base score for presence
488
+
489
+ return score
490
+
491
+ def _score_other_consonant(
492
+ self, phoneme: str, features: Dict, model: Dict
493
+ ) -> float:
494
+ """Score other consonant types"""
495
+ score = 0.0
496
+
497
+ energy = features.get("rms_mean", 0)
498
+ centroid = features.get("spectral_centroid_mean", 0)
499
+ zcr = features.get("zcr_mean", 0)
500
+
501
+ if model["type"] == "liquid":
502
+ # L/R sounds - moderate energy, specific spectral characteristics
503
+ if 0.01 <= energy <= 0.08:
504
+ score += 0.3
505
+ if phoneme == "R" and centroid < 1800: # R lowers F3
506
+ score += 0.4
507
+ elif phoneme == "L" and 1200 <= centroid <= 2200:
508
+ score += 0.4
509
+ score += 0.3 # Base score
510
+
511
+ elif model["type"] == "nasal":
512
+ # Nasal sounds - good energy, specific spectral pattern
513
+ if energy > 0.005:
514
+ score += 0.4
515
+ if 800 <= centroid <= 2000:
516
+ score += 0.3
517
+ score += 0.3
518
+
519
+ elif model["type"] == "glide":
520
+ # W/Y sounds - transition characteristics
521
+ if energy > 0.005:
522
+ score += 0.5
523
+ score += 0.5
524
+
525
+ elif model["type"] == "affricate":
526
+ # CH/JH - combination of stop + fricative
527
+ if energy > 0.02: # Burst component
528
+ score += 0.3
529
+ if zcr > 0.1: # Fricative component
530
+ score += 0.4
531
+ score += 0.3
532
+
533
+ return score
534
+
535
+ def _apply_context_adjustments(
536
+ self, score: float, phoneme: str, context: Dict
537
+ ) -> float:
538
+ """Apply contextual adjustments"""
539
+ # Position in word adjustments
540
+ position = context.get("position", "middle")
541
+
542
+ if position == "initial" and phoneme in ["TH", "DH"]:
543
+ score *= 1.1 # Easier in initial position
544
+ elif position == "final" and phoneme in ["T", "D", "K", "G"]:
545
+ score *= 0.9 # Harder in final position (Vietnamese tendency to drop)
546
+
547
+ # Surrounding phonemes
548
+ prev_phoneme = context.get("prev_phoneme")
549
+ next_phoneme = context.get("next_phoneme")
550
+
551
+ # Consonant clusters (difficult for Vietnamese)
552
+ if (
553
+ prev_phoneme
554
+ and prev_phoneme in ["S", "T", "K"]
555
+ and phoneme in ["T", "K", "P"]
556
+ ):
557
+ score *= 0.8 # Consonant clusters are harder
558
+
559
+ return score
560
 
 
561
 
562
+ # =============================================================================
563
+ # ENHANCED PRONUNCIATION ASSESSOR
564
+ # =============================================================================
 
565
 
566
 
567
+ class EnhancedPronunciationAssessor:
568
+ """Enhanced assessor supporting any English word"""
569
 
570
  def __init__(self):
571
+ self.phoneme_processor = EnhancedPhonemeProcessor()
 
 
 
 
 
572
  self.sample_rate = 16000
573
 
574
+ def process_audio_file(self, file_path: str, reference_text: str) -> Dict:
575
+ """Process audio file with enhanced phoneme analysis"""
576
+
577
+ # Load and validate audio
578
+ audio, sr = librosa.load(file_path, sr=self.sample_rate)
579
+ duration = len(audio) / sr
580
+ max_amplitude = np.max(np.abs(audio))
581
 
582
+ # Audio quality analysis
583
+ audio_info = self._analyze_audio_quality(audio, duration, max_amplitude)
 
 
 
584
 
585
+ # Extract comprehensive features
586
+ features = self._extract_comprehensive_features(audio)
 
587
 
588
+ # Text analysis
589
+ text_analysis = self._analyze_text(reference_text)
 
590
 
591
+ # Pronunciation assessment
592
+ pronunciation_analysis = self._assess_pronunciation(
593
+ audio, features, reference_text, text_analysis
 
594
  )
595
 
596
+ return {
597
+ "audio_info": audio_info,
598
+ "text_analysis": text_analysis,
599
+ "pronunciation_analysis": pronunciation_analysis,
600
+ "features": features,
601
+ }
602
+
603
+ def _analyze_audio_quality(
604
+ self, audio: np.ndarray, duration: float, max_amplitude: float
605
+ ) -> Dict:
606
+ """Comprehensive audio quality analysis"""
607
+ issues = []
608
+ quality_score = 1.0
609
+
610
+ # Duration checks
611
+ if duration < 0.5:
612
+ issues.append("too_short")
613
+ quality_score *= 0.5
614
+ elif duration > 30:
615
+ issues.append("too_long")
616
+ quality_score *= 0.8
617
+
618
+ # Amplitude checks
619
+ if max_amplitude < 0.005:
620
+ issues.append("too_quiet")
621
+ quality_score *= 0.6
622
+ elif max_amplitude > 0.98:
623
+ issues.append("clipped")
624
+ quality_score *= 0.7
625
+
626
+ # Noise analysis
627
+ noise_floor = np.mean(np.abs(audio[: int(0.1 * len(audio))])) # First 100ms
628
+ if noise_floor > 0.02:
629
+ issues.append("noisy")
630
+ quality_score *= 0.8
631
+
632
+ # Signal-to-noise ratio
633
+ signal_power = np.mean(audio**2)
634
+ snr = 10 * np.log10(signal_power / (noise_floor**2 + 1e-10))
635
+
636
+ return {
637
+ "duration": duration,
638
+ "max_amplitude": max_amplitude,
639
+ "noise_floor": noise_floor,
640
+ "snr": snr,
641
+ "quality_score": quality_score,
642
+ "issues": issues,
643
+ "quality_status": "good" if not issues else ",".join(issues),
644
+ }
645
+
646
+ def _extract_comprehensive_features(self, audio: np.ndarray) -> Dict:
647
+ """Extract comprehensive acoustic features"""
648
+ features = {}
649
+
650
+ # Basic features
651
+ features["mfcc"] = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
652
+ features["mfcc_mean"] = np.mean(features["mfcc"], axis=1).tolist()
653
+
654
+ # Energy features
655
+ rms = librosa.feature.rms(y=audio, hop_length=512)[0]
656
+ features["rms"] = rms.tolist()
657
+ features["rms_mean"] = float(np.mean(rms))
658
+ features["rms_std"] = float(np.std(rms))
659
+
660
+ # Spectral features
661
+ spectral_centroid = librosa.feature.spectral_centroid(
662
+ y=audio, sr=self.sample_rate
663
+ )[0]
664
+ features["spectral_centroid"] = spectral_centroid.tolist()
665
+ features["spectral_centroid_mean"] = float(np.mean(spectral_centroid))
666
+ features["spectral_centroid_std"] = float(np.std(spectral_centroid))
667
+
668
+ # Additional spectral features
669
+ spectral_bandwidth = librosa.feature.spectral_bandwidth(
670
+ y=audio, sr=self.sample_rate
671
+ )[0]
672
+ features["spectral_bandwidth_mean"] = float(np.mean(spectral_bandwidth))
673
+
674
+ spectral_rolloff = librosa.feature.spectral_rolloff(
675
+ y=audio, sr=self.sample_rate
676
+ )[0]
677
+ features["spectral_rolloff_mean"] = float(np.mean(spectral_rolloff))
678
+
679
+ # Zero crossing rate
680
+ zcr = librosa.feature.zero_crossing_rate(audio, hop_length=512)[0]
681
+ features["zcr"] = zcr.tolist()
682
+ features["zcr_mean"] = float(np.mean(zcr))
683
+ features["zcr_std"] = float(np.std(zcr))
684
+
685
+ # Pitch analysis
686
+ pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate)
687
+ f0 = []
688
+ for t in range(pitches.shape[1]):
689
+ index = magnitudes[:, t].argmax()
690
+ pitch = pitches[index, t]
691
+ f0.append(
692
+ float(pitch) if pitch > 80 else 0.0
693
+ ) # Filter out very low frequencies
694
+
695
+ features["f0"] = f0
696
+ valid_f0 = [f for f in f0 if f > 0]
697
+ features["f0_mean"] = float(np.mean(valid_f0)) if valid_f0 else 0.0
698
+ features["f0_std"] = float(np.std(valid_f0)) if valid_f0 else 0.0
699
+
700
+ # Formant estimation (simplified)
701
+ features["formants"] = self._estimate_formants(audio)
702
+
703
+ return features
704
+
705
+
706
+
707
+ def _analyze_text(self, text: str) -> Dict:
708
+ """Analyze reference text for phonemes and difficulty"""
709
+ words = text.lower().strip().split()
710
+ text_info = {
711
+ "words": [],
712
+ "total_phonemes": 0,
713
+ "difficulty_score": 0,
714
+ "challenging_sounds": [],
715
+ }
716
+
717
+ all_phonemes = []
718
+
719
+ for word in words:
720
+ word_info = self.phoneme_processor.get_word_phonemes(word)
721
+
722
+ # Calculate word difficulty
723
+ word_difficulty = self.phoneme_processor.get_difficulty_score(
724
+ word_info.phonemes
725
+ )
726
+
727
+ # Find challenging phonemes
728
+ challenging = []
729
+ for phoneme in word_info.phonemes:
730
+ clean_phoneme = re.sub(r"[0-9]", "", phoneme)
731
+ difficulty = self.phoneme_processor.difficulty_map.get(clean_phoneme, 0)
732
+ if difficulty > 0.6:
733
+ challenging.append(clean_phoneme)
734
+
735
+ word_data = {
736
+ "word": word,
737
+ "phonemes": word_info.phonemes,
738
+ "ipa": word_info.ipa_transcription,
739
+ "syllables": word_info.syllables,
740
+ "difficulty": word_difficulty,
741
+ "challenging_phonemes": challenging,
742
+ }
743
+
744
+ text_info["words"].append(word_data)
745
+ all_phonemes.extend(word_info.phonemes)
746
+ text_info["challenging_sounds"].extend(challenging)
747
+
748
+ text_info["total_phonemes"] = len(all_phonemes)
749
+ text_info["difficulty_score"] = self.phoneme_processor.get_difficulty_score(
750
+ all_phonemes
751
  )
752
+ text_info["challenging_sounds"] = list(
753
+ set(text_info["challenging_sounds"])
754
+ ) # Remove duplicates
755
+
756
+ return text_info
757
+
758
+ def _assess_pronunciation(
759
+ self, audio: np.ndarray, features: Dict, text: str, text_analysis: Dict
760
+ ) -> Dict:
761
+ """Comprehensive pronunciation assessment"""
762
+ words = text.lower().strip().split()
763
+ word_segments = self._segment_words_advanced(audio, features, len(words))
764
+
765
+ word_results = []
766
+ phoneme_results = []
767
+
768
+ for i, word in enumerate(words):
769
+ if i < len(word_segments):
770
+ word_audio = word_segments[i]
771
+ word_info = text_analysis["words"][i]
772
+
773
+ # Assess word
774
+ word_result = self._assess_word_comprehensive(
775
+ word_audio, word_info, features, i, len(words)
776
+ )
777
+
778
+ word_results.append(word_result)
779
+ phoneme_results.extend(word_result["phoneme_details"])
780
 
781
+ # Calculate overall metrics
782
+ overall_score = (
783
+ np.mean([wr["score"] for wr in word_results]) if word_results else 0.0
784
+ )
785
 
786
+ # Generate comprehensive feedback
787
+ feedback = self._generate_comprehensive_feedback(
788
+ word_results, text_analysis, features, overall_score
789
+ )
790
 
791
+ # Difficulty analysis
792
+ difficulty_analysis = self._analyze_difficulty_performance(
793
+ word_results, text_analysis
794
  )
795
 
796
  return {
 
797
  "overall_score": overall_score,
798
+ "words": word_results,
799
+ "phoneme_details": phoneme_results,
 
800
  "feedback": feedback,
801
+ "status": self._get_status(overall_score),
802
+ "difficulty_analysis": difficulty_analysis,
803
  }
804
 
805
+ def _segment_words_advanced(
806
+ self, audio: np.ndarray, features: Dict, num_words: int
807
+ ) -> List[np.ndarray]:
808
+ """Advanced word segmentation using energy and spectral cues"""
809
+ if num_words == 1:
810
+ return [audio]
811
+
812
+ # Use RMS energy to find word boundaries
813
+ rms = features["rms"]
814
+
815
+ # Find energy peaks (potential word centers)
816
+ from scipy.signal import find_peaks
817
+
818
+ # Smooth RMS for better peak detection
819
+ window_size = min(5, len(rms) // 4)
820
+ if window_size > 0:
821
+ rms_smooth = np.convolve(
822
+ rms, np.ones(window_size) / window_size, mode="same"
823
+ )
824
+ else:
825
+ rms_smooth = rms
826
+
827
+ peaks, _ = find_peaks(
828
+ rms_smooth,
829
+ height=np.mean(rms_smooth) * 0.5,
830
+ distance=len(rms) // (num_words * 2),
831
+ )
832
+
833
+ # If we don't find enough peaks, fall back to equal division
834
+ if len(peaks) < num_words:
835
+ segment_length = len(audio) // num_words
836
+ segments = []
837
+ for i in range(num_words):
838
+ start = i * segment_length
839
+ end = start + segment_length if i < num_words - 1 else len(audio)
840
+ segments.append(audio[start:end])
841
+ return segments
842
+
843
+ # Use peaks to define word boundaries
844
+ hop_length = 512
845
+ peak_times = librosa.frames_to_samples(peaks, hop_length=hop_length)
846
+
847
+ segments = []
848
+ for i in range(num_words):
849
+ if i == 0:
850
+ start = 0
851
+ end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
852
+ num_words * 4
853
+ )
854
+ elif i == num_words - 1:
855
+ start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
856
+ num_words * 4
857
+ )
858
+ end = len(audio)
859
+ else:
860
+ start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
861
+ num_words * 6
862
+ )
863
+ end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
864
+ num_words * 6
865
+ )
866
+
867
+ start = max(0, start)
868
+ end = min(len(audio), end)
869
+ segments.append(audio[start:end])
870
+
871
+ return segments
872
+
873
+ def _assess_word_comprehensive(
874
  self,
875
+ word_audio: np.ndarray,
876
+ word_info: Dict,
877
+ global_features: Dict,
878
+ word_index: int,
879
+ total_words: int,
880
+ ) -> Dict:
881
+ """Comprehensive word assessment"""
882
+ if len(word_audio) < 500:
883
+ return {
884
+ "word": word_info["word"],
885
+ "score": 0.2,
886
+ "status": "poor",
887
+ "issues": ["too_short"],
888
+ "phoneme_details": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
889
  }
890
 
891
+ # Extract word-level features
892
+ word_features = self._extract_word_features(word_audio)
893
+
894
+ # Assess each phoneme
895
+ phonemes = word_info["phonemes"]
896
+ phoneme_segments = self._segment_phonemes(word_audio, len(phonemes))
897
+
898
+ phoneme_scores = []
899
+ phoneme_details = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
900
 
901
+ for i, (phoneme, segment) in enumerate(zip(phonemes, phoneme_segments)):
902
+ if len(segment) > 100: # Minimum segment length
903
+ segment_features = self._extract_segment_features(segment)
904
+
905
+ # Context information
906
+ context = {
907
+ "position": (
908
+ "initial"
909
+ if i == 0
910
+ else "final" if i == len(phonemes) - 1 else "middle"
 
 
 
911
  ),
912
+ "prev_phoneme": phonemes[i - 1] if i > 0 else None,
913
+ "next_phoneme": phonemes[i + 1] if i < len(phonemes) - 1 else None,
914
+ "word_position": word_index / total_words,
915
  }
916
 
917
+ score = self.phoneme_processor.score_phoneme_advanced(
918
+ phoneme, segment_features, context
919
+ )
920
+
921
+ phoneme_scores.append(score)
922
+ phoneme_details.append(
923
+ {
924
+ "phoneme": phoneme,
925
+ "score": score,
926
+ "position": context["position"],
927
+ "difficulty": self.phoneme_processor.difficulty_map.get(
928
+ re.sub(r"[0-9]", "", phoneme), 0.3
929
+ ),
930
+ "word": word_info["word"],
931
+ }
932
+ )
933
+
934
+ # Word-level score
935
+ word_score = np.mean(phoneme_scores) if phoneme_scores else 0.0
936
+
937
+ # Detect issues
938
+ issues = []
939
+ if word_score < 0.3:
940
+ issues.append("very_poor_clarity")
941
+ if word_features.get("rms_mean", 0) < 0.005:
942
+ issues.append("too_quiet")
943
+ if word_features.get("zcr_mean", 0) > 0.3:
944
+ issues.append("too_noisy")
945
+
946
+ return {
947
+ "word": word_info["word"],
948
+ "score": word_score,
949
+ "status": self._get_word_status(word_score),
950
+ "phonemes": phonemes,
951
+ "phoneme_scores": phoneme_scores,
952
+ "phoneme_details": phoneme_details,
953
+ "ipa": word_info["ipa"],
954
+ "syllables": word_info["syllables"],
955
+ "difficulty": word_info["difficulty"],
956
+ "issues": issues,
957
+ }
958
+
959
+ def _extract_word_features(self, word_audio: np.ndarray) -> Dict:
960
+ """Extract features for word segment"""
961
+ if len(word_audio) < 100:
962
+ return {}
963
+
964
+ mfcc = librosa.feature.mfcc(y=word_audio, sr=self.sample_rate, n_mfcc=13)
965
+ rms = librosa.feature.rms(y=word_audio)[0]
966
+ centroid = librosa.feature.spectral_centroid(y=word_audio, sr=self.sample_rate)[
967
+ 0
968
+ ]
969
+ zcr = librosa.feature.zero_crossing_rate(word_audio)[0]
970
+
971
+ return {
972
+ "mfcc_mean": np.mean(mfcc, axis=1).tolist(),
973
+ "rms_mean": float(np.mean(rms)),
974
+ "spectral_centroid_mean": float(np.mean(centroid)),
975
+ "zcr_mean": float(np.mean(zcr)),
976
+ }
977
+
978
+ def _segment_phonemes(
979
+ self, word_audio: np.ndarray, num_phonemes: int
980
+ ) -> List[np.ndarray]:
981
+ """Segment word audio into phonemes"""
982
+ if num_phonemes <= 1:
983
+ return [word_audio]
984
+
985
+ segment_length = len(word_audio) // num_phonemes
986
+ segments = []
987
+
988
+ for i in range(num_phonemes):
989
+ start = i * segment_length
990
+ end = start + segment_length if i < num_phonemes - 1 else len(word_audio)
991
+ segments.append(word_audio[start:end])
992
 
993
+ return segments
994
 
995
+ def _extract_segment_features(self, segment: np.ndarray) -> Dict:
996
+ """Extract features for phoneme segment"""
997
+ if len(segment) < 50:
998
+ return {}
999
 
1000
+ # Basic features for short segments
1001
+ rms_mean = float(np.mean(librosa.feature.rms(y=segment)[0]))
1002
+ zcr_mean = float(np.mean(librosa.feature.zero_crossing_rate(segment)[0]))
1003
 
1004
+ # Spectral centroid
1005
+ centroid = librosa.feature.spectral_centroid(y=segment, sr=self.sample_rate)[0]
1006
+ centroid_mean = float(np.mean(centroid))
1007
 
1008
+ # MFCC for short segment
1009
+ if len(segment) > 512:
1010
+ mfcc = librosa.feature.mfcc(y=segment, sr=self.sample_rate, n_mfcc=5)
1011
+ mfcc_mean = np.mean(mfcc, axis=1).tolist()
1012
+ else:
1013
+ mfcc_mean = [0] * 5
1014
+
1015
+ return {
1016
+ "rms_mean": rms_mean,
1017
+ "zcr_mean": zcr_mean,
1018
+ "spectral_centroid_mean": centroid_mean,
1019
+ "mfcc_mean": mfcc_mean,
1020
+ }
1021
+
1022
+ def _generate_comprehensive_feedback(
1023
  self,
1024
+ word_results: List[Dict],
1025
+ text_analysis: Dict,
1026
+ features: Dict,
1027
  overall_score: float,
 
 
1028
  ) -> List[str]:
1029
+ """Generate comprehensive feedback"""
 
1030
  feedback = []
1031
 
1032
+ # Overall performance feedback
1033
+ if overall_score >= 0.85:
1034
+ feedback.append(
1035
+ "🎉 Outstanding pronunciation! You sound very natural and clear."
1036
+ )
1037
+ elif overall_score >= 0.7:
1038
+ feedback.append(
1039
+ "👍 Great job! Your pronunciation is quite good with room for minor improvements."
1040
+ )
1041
+ elif overall_score >= 0.5:
1042
+ feedback.append(
1043
+ "📚 Good progress! Keep practicing the areas highlighted below."
1044
+ )
1045
+ elif overall_score >= 0.3:
1046
  feedback.append(
1047
+ "🔄 Keep working on it! Focus on clarity and the specific sounds mentioned."
1048
  )
1049
  else:
1050
+ feedback.append(
1051
+ "💪 Don't give up! Start with slower, clearer pronunciation."
1052
+ )
1053
+
1054
+ # Audio quality feedback
1055
+ audio_quality = features.get("rms_mean", 0)
1056
+ if audio_quality < 0.01:
1057
+ feedback.append(
1058
+ "🔊 Try speaking louder and more clearly - your recording was quite quiet."
1059
+ )
1060
+ elif audio_quality > 0.15:
1061
+ feedback.append("🔉 Good volume level! Your voice comes through clearly.")
1062
+
1063
+ # Pitch variation feedback
1064
+ pitch_std = features.get("f0_std", 0)
1065
+ if pitch_std < 20:
1066
+ feedback.append(
1067
+ "🎵 Try adding more natural pitch variation to sound more engaging."
1068
+ )
1069
+ elif pitch_std > 80:
1070
+ feedback.append(
1071
+ "🎵 Good pitch variation! Your speech sounds natural and expressive."
1072
+ )
1073
 
1074
+ # Word-specific feedback
1075
+ poor_words = [wr for wr in word_results if wr["score"] < 0.5]
1076
+ if poor_words:
1077
+ word_names = [w["word"] for w in poor_words]
1078
+ feedback.append(f"🎯 Focus extra practice on: {', '.join(word_names)}")
1079
 
1080
  # Phoneme-specific feedback for Vietnamese speakers
1081
+ all_challenging = []
1082
+ for word_result in word_results:
1083
+ for phoneme_detail in word_result.get("phoneme_details", []):
1084
+ if phoneme_detail["score"] < 0.5 and phoneme_detail["difficulty"] > 0.6:
1085
+ all_challenging.append(phoneme_detail["phoneme"])
1086
+
1087
+ if all_challenging:
1088
+ unique_challenging = list(set(all_challenging))
1089
+ vietnamese_tips = {
1090
+ "TH": "Put your tongue between your teeth and blow air gently",
1091
+ "DH": "Same tongue position as TH, but vibrate your vocal cords",
1092
+ "V": "Touch your bottom lip to your top teeth, then voice",
1093
+ "R": "Curl your tongue without touching the roof of your mouth",
1094
+ "L": "Touch your tongue tip to the roof of your mouth",
1095
+ "Z": "Like 'S' but with vocal cord vibration",
1096
+ }
1097
 
1098
+ for phoneme in unique_challenging[:3]: # Top 3 challenging
1099
+ clean_phoneme = re.sub(r"[0-9]", "", phoneme)
1100
+ if clean_phoneme in vietnamese_tips:
1101
+ feedback.append(
1102
+ f"🔤 {clean_phoneme} sound: {vietnamese_tips[clean_phoneme]}"
1103
+ )
1104
+
1105
+ # Difficulty-based encouragement
1106
+ text_difficulty = text_analysis["difficulty_score"]
1107
+ if text_difficulty > 0.7 and overall_score > 0.6:
1108
+ feedback.append(
1109
+ "💪 Impressive! You tackled some very challenging sounds for Vietnamese speakers."
1110
  )
1111
+ elif text_difficulty < 0.3 and overall_score < 0.7:
1112
+ feedback.append("📈 Try some more challenging words as you improve!")
 
1113
 
1114
  return feedback
1115
 
1116
+ def _analyze_difficulty_performance(
1117
+ self, word_results: List[Dict], text_analysis: Dict
1118
+ ) -> Dict:
1119
+ """Analyze performance vs difficulty"""
1120
+ easy_phonemes = [] # difficulty < 0.4
1121
+ medium_phonemes = [] # 0.4 <= difficulty < 0.7
1122
+ hard_phonemes = [] # difficulty >= 0.7
1123
+
1124
+ for word_result in word_results:
1125
+ for phoneme_detail in word_result.get("phoneme_details", []):
1126
+ difficulty = phoneme_detail["difficulty"]
1127
+ score = phoneme_detail["score"]
1128
+
1129
+ if difficulty < 0.4:
1130
+ easy_phonemes.append(score)
1131
+ elif difficulty < 0.7:
1132
+ medium_phonemes.append(score)
1133
+ else:
1134
+ hard_phonemes.append(score)
1135
+
1136
+ return {
1137
+ "easy_sounds_avg": float(np.mean(easy_phonemes)) if easy_phonemes else 0.0,
1138
+ "medium_sounds_avg": (
1139
+ float(np.mean(medium_phonemes)) if medium_phonemes else 0.0
1140
+ ),
1141
+ "hard_sounds_avg": float(np.mean(hard_phonemes)) if hard_phonemes else 0.0,
1142
+ "total_challenging_sounds": len(hard_phonemes),
1143
+ "mastered_difficult_sounds": len([s for s in hard_phonemes if s > 0.7]),
1144
+ "text_difficulty": text_analysis["difficulty_score"],
1145
+ }
1146
+
1147
  def _get_word_status(self, score: float) -> str:
1148
  """Get word status from score"""
1149
  if score >= 0.8:
 
1155
  else:
1156
  return "poor"
1157
 
1158
+ def _get_status(self, score: float) -> str:
1159
+ """Get overall status"""
1160
+ return self._get_word_status(score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1161
 
1162
 
1163
  # =============================================================================
1164
+ # ENHANCED FASTAPI APP
1165
  # =============================================================================
1166
 
 
 
1167
 
1168
# Module-level assessor instance, created once at import time and used by the
# route handlers below.
assessor = EnhancedPronunciationAssessor()
1170
 
1171
+ # =============================================================================
1172
+ # ENHANCED ENDPOINTS
1173
+ # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
1174
 
1175
 
1176
@router.post("/assess", response_model=PronunciationResult)
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file"),
    reference_text: str = Form(..., description="Any English text"),
    difficulty_level: str = Form("medium", description="easy, medium, hard"),
):
    """
    Assess pronunciation for ANY English text
    Supports 60,000+ words from CMU Pronouncing Dictionary

    The upload is written to a temporary file, analysed by the module-level
    ``assessor`` and the temporary file is always removed afterwards. With
    ``difficulty_level`` "easy" the scores are scaled by 1.2 (capped at 1.0),
    with "hard" by 0.8; status labels are recomputed from the scaled scores.

    Raises:
        HTTPException: 400 for empty, over-long or invalid reference text,
            500 when processing fails.
    """

    import time
    import traceback

    start_time = time.time()
    print("Starting pronunciation assessment...")
    print("Reference text:", reference_text)
    print("Difficulty level:", difficulty_level)
    print("Audio filename:", audio.filename if audio else "None")

    # Validate inputs
    if not reference_text.strip():
        print("Validation failed: Reference text is empty")
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")

    if len(reference_text) > 1000:
        print("Validation failed: Reference text too long")
        raise HTTPException(
            status_code=400, detail="Reference text too long (max 1000 characters)"
        )

    # Only English letters, whitespace and basic punctuation are accepted.
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        print("Validation failed: Invalid characters in text")
        print("Text that failed validation:", repr(reference_text))
        raise HTTPException(
            status_code=400,
            detail="Text contains invalid characters. Only English letters, spaces, and basic punctuation (,.'-!?;:) allowed.",
        )

    tmp_path = None
    try:
        # Save uploaded file
        print("Saving uploaded file...")
        # The filename is client-controlled: keep only a short alphanumeric
        # extension and fall back to ".wav" for anything else.
        file_extension = ".wav"
        if audio.filename:
            candidate = os.path.splitext(os.path.basename(audio.filename))[1]
            if re.fullmatch(r"\.[A-Za-z0-9]{1,10}", candidate):
                file_extension = candidate

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as tmp_file:
            # Record the path first so cleanup happens even if the read fails.
            tmp_path = tmp_file.name
            content = await audio.read()
            tmp_file.write(content)
        print("File saved to:", tmp_path)
        print("File size:", len(content), "bytes")

        # Process with enhanced assessor (the file is closed at this point, so
        # it can be reopened on every platform).
        print("Processing audio file...")
        result = assessor.process_audio_file(tmp_path, reference_text)
        print("Audio processing completed")

        # Apply difficulty adjustments
        analysis = result["pronunciation_analysis"]
        if difficulty_level in ("easy", "hard"):
            factor = 1.2 if difficulty_level == "easy" else 0.8
            analysis["overall_score"] = min(1.0, analysis["overall_score"] * factor)
            for word in analysis["words"]:
                word["score"] = min(1.0, word["score"] * factor)
                # Keep the label consistent with the scaled score.
                word["status"] = assessor._get_word_status(word["score"])
            analysis["status"] = assessor._get_status(analysis["overall_score"])

        processing_time = time.time() - start_time
        print("Processing completed successfully in", processing_time, "seconds")

        return PronunciationResult(
            overall_score=analysis["overall_score"],
            status=analysis["status"],
            feedback=analysis["feedback"],
            words=analysis["words"],
            phoneme_details=analysis["phoneme_details"],
            audio_info=result["audio_info"],
            processing_time=processing_time,
            difficulty_analysis=analysis["difficulty_analysis"],
        )

    except Exception as e:
        print("Exception occurred during processing:", str(e))
        traceback.print_exc()
        raise HTTPException(
            status_code=500, detail=f"Processing error: {str(e)}"
        ) from e
    finally:
        # Clean up the temporary file on success and on failure alike.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                print("Warning: could not remove temporary file:", tmp_path)
 
 
 
 
 
1271
 
1272
 
1273
@router.get("/phonemes/{word}")
async def get_word_phonemes(word: str):
    """Return the phoneme breakdown and difficulty details for one word.

    The response carries the phoneme list, IPA transcription, syllables and
    stress pattern reported by the phoneme processor, an overall difficulty
    score with its easy/medium/hard label, the phonemes whose individual
    difficulty exceeds 0.6 (each with tips), and word-level tips.

    Raises:
        HTTPException: 500 if any step of the lookup fails.
    """
    try:
        processor = assessor.phoneme_processor
        info = processor.get_word_phonemes(word)

        # Overall difficulty of the word (tuned for Vietnamese speakers)
        score = processor.get_difficulty_score(info.phonemes)

        # Strip stress digits, then pair every symbol with its own difficulty
        symbols = (re.sub(r"[0-9]", "", raw) for raw in info.phonemes)
        weighted = (
            (symbol, processor.difficulty_map.get(symbol, 0)) for symbol in symbols
        )
        hard_sounds = [
            {
                "phoneme": symbol,
                "difficulty": weight,
                "tips": get_phoneme_tips(symbol),
            }
            for symbol, weight in weighted
            if weight > 0.6
        ]

        if score > 0.7:
            label = "hard"
        elif score > 0.4:
            label = "medium"
        else:
            label = "easy"

        return {
            "word": word,
            "phonemes": info.phonemes,
            "ipa_transcription": info.ipa_transcription,
            "syllables": info.syllables,
            "stress_pattern": info.stress_pattern,
            "difficulty_score": score,
            "difficulty_level": label,
            "challenging_phonemes": hard_sounds,
            "pronunciation_tips": get_word_pronunciation_tips(word, info.phonemes),
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing word: {str(e)}")
1314
+
1315
+
1316
@router.post("/analyze/text")
async def analyze_text_difficulty(text: str = Form(...)):
    """Report how hard a piece of English text is to pronounce.

    Delegates the analysis to ``assessor._analyze_text`` and reshapes its
    result into the response: word count, total phonemes, overall difficulty
    with an easy/medium/hard label, challenging sounds, the per-word
    breakdown and practice recommendations.

    Raises:
        HTTPException: 500 if the analysis fails.
    """
    try:
        analysis = assessor._analyze_text(text)

        words = analysis["words"]
        phoneme_total = analysis["total_phonemes"]
        score = analysis["difficulty_score"]

        if score > 0.7:
            label = "hard"
        elif score > 0.4:
            label = "medium"
        else:
            label = "easy"

        sounds = analysis["challenging_sounds"]

        return {
            "text": text,
            "word_count": len(words),
            "total_phonemes": phoneme_total,
            "overall_difficulty": score,
            "difficulty_level": label,
            "challenging_sounds": sounds,
            "word_breakdown": words,
            "recommendations": get_text_recommendations(analysis),
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Text analysis error: {str(e)}")
1339
+
1340
+
1341
@router.get("/dictionary/search")
async def search_dictionary(query: str, limit: int = 20):
    """Search the CMU dictionary for words containing ``query``.

    The lower-cased query is tested as a substring of each dictionary key.
    At most ``limit`` matches are collected, in dictionary iteration order,
    and the collected matches are returned sorted by ascending difficulty.

    Args:
        query: Substring to look for.
        limit: Maximum number of matches to return; values <= 0 yield none.

    Raises:
        HTTPException: 500 if the search fails.
    """
    try:
        processor = assessor.phoneme_processor
        query_lower = query.lower()
        matching_words = []

        for word in processor.cmu_dict:
            # Stop as soon as the quota is filled instead of walking the
            # rest of the dictionary for matches that would be discarded.
            if len(matching_words) >= limit:
                break
            if query_lower not in word:
                continue

            word_info = processor.get_word_phonemes(word)
            difficulty = processor.get_difficulty_score(word_info.phonemes)

            matching_words.append(
                {
                    "word": word,
                    "phonemes": word_info.phonemes,
                    "ipa": word_info.ipa_transcription,
                    "difficulty": difficulty,
                    "difficulty_level": (
                        "hard"
                        if difficulty > 0.7
                        else "medium" if difficulty > 0.4 else "easy"
                    ),
                }
            )

        # Sort by difficulty (easiest first)
        matching_words.sort(key=lambda x: x["difficulty"])

        return {"query": query, "found": len(matching_words), "words": matching_words}

    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Dictionary search error: {str(e)}"
        ) from e
1381
+
1382
+
1383
@router.get("/practice/level/{level}")
async def get_practice_words(level: str, count: int = 10):
    """Return up to ``count`` randomly chosen practice words for a level.

    Dictionary words are shuffled and scanned until ``count`` words whose
    difficulty falls in the level's band have been found. Words shorter than
    3 or longer than 12 characters, or containing non-alphabetic characters,
    are skipped. Fewer than ``count`` words may be returned.

    The bands follow the same thresholds as the ``difficulty_level`` labels
    used by the other endpoints, so a score sitting exactly on a boundary
    belongs to one level only: easy is [0, 0.4], medium is (0.4, 0.7] and
    hard is (0.7, 1.0].

    Raises:
        HTTPException: 400 for an unknown level, 500 if word selection fails.
    """

    if level not in ["easy", "medium", "hard"]:
        raise HTTPException(
            status_code=400, detail="Level must be easy, medium, or hard"
        )

    try:
        processor = assessor.phoneme_processor
        practice_words = []

        # Define difficulty ranges
        if level == "easy":
            difficulty_range = (0, 0.4)
        elif level == "medium":
            difficulty_range = (0.4, 0.7)
        else:  # hard
            difficulty_range = (0.7, 1.0)
        lower, upper = difficulty_range

        # Sample words from dictionary
        word_list = list(processor.cmu_dict.keys())
        np.random.shuffle(word_list)

        for word in word_list:
            if len(practice_words) >= count:
                break

            # Skip very short or very long words
            if len(word) < 3 or len(word) > 12:
                continue

            # Skip words with special characters
            if not word.isalpha():
                continue

            word_info = processor.get_word_phonemes(word)
            difficulty = processor.get_difficulty_score(word_info.phonemes)

            # Only the easy band includes its lower bound; medium and hard
            # exclude theirs so 0.4 and 0.7 are not served under two levels.
            if level == "easy":
                in_band = lower <= difficulty <= upper
            else:
                in_band = lower < difficulty <= upper

            if in_band:
                practice_words.append(
                    {
                        "word": word,
                        "phonemes": word_info.phonemes,
                        "ipa": word_info.ipa_transcription,
                        "difficulty": difficulty,
                        "tips": get_word_pronunciation_tips(word, word_info.phonemes),
                    }
                )

        return {
            "level": level,
            "difficulty_range": difficulty_range,
            "count": len(practice_words),
            "words": practice_words,
        }

    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Practice words error: {str(e)}"
        ) from e
1445
+
1446
+
1447
+ # =============================================================================
1448
+ # HELPER FUNCTIONS
1449
+ # =============================================================================
1450
+
1451
+
1452
def get_phoneme_tips(phoneme: str) -> List[str]:
    """Look up articulation tips for a single phoneme symbol.

    Symbols with dedicated advice are TH, DH, V, R, L and Z; any other
    symbol gets one generic practice tip. A new list is built on every call.
    """
    advice = {
        "TH": [
            "Place tongue tip between upper and lower teeth",
            "Blow air gently while keeping tongue in position",
            "Should feel air flowing over tongue",
        ],
        "DH": [
            "Same tongue position as TH",
            "Add vocal cord vibration",
            "Should feel buzzing in throat",
        ],
        "V": [
            "Touch bottom lip to upper teeth",
            "Voice while air flows through the gap",
            "Don't use both lips like Vietnamese 'V'",
        ],
        "R": [
            "Curl tongue without touching roof of mouth",
            "Don't roll the R like in Vietnamese",
            "Tongue should float freely",
        ],
        "L": [
            "Touch tongue tip to roof of mouth behind teeth",
            "Let air flow around sides of tongue",
            "Make sure tongue actually touches",
        ],
        "Z": [
            "Same tongue position as 'S'",
            "Add vocal cord vibration",
            "Should buzz like a bee",
        ],
    }

    if phoneme in advice:
        return advice[phoneme]
    # No dedicated advice for this symbol
    return ["Practice this sound slowly and clearly"]
1488
+
1489
+
1490
def get_word_pronunciation_tips(word: str, phonemes: List[str]) -> List[str]:
    """Build word-level pronunciation tips from a word's phoneme sequence.

    Args:
        word: The word itself; accepted for interface compatibility and not
            consulted by the current rules.
        phonemes: Phoneme symbols for the word, optionally carrying stress
            digits on vowels (e.g. ``"IY1"``).

    Returns:
        One tip per rule that fires, in rule order: S+T/K/P cluster, TH,
        R-and-L, final stop consonant, long vowel. If no rule fires, a
        single generic syllable-practice tip is returned.
    """
    tips = []

    # Consonant clusters: compare adjacent symbols rather than searching the
    # space-joined string, where "S T" would also match the pair S + TH.
    has_s_cluster = any(
        first == "S" and second in ("T", "K", "P")
        for first, second in zip(phonemes, phonemes[1:])
    )
    if has_s_cluster:
        tips.append("Practice the consonant cluster slowly, then speed up")

    # TH sounds
    if "TH" in phonemes:
        tips.append("Remember: tongue between teeth for TH sounds")

    # R and L distinction
    if "R" in phonemes and "L" in phonemes:
        tips.append("Focus on R (no touching) vs L (tongue touches roof)")

    # Final consonants (Vietnamese tendency to drop)
    final_phoneme = phonemes[-1] if phonemes else ""
    if final_phoneme in ("T", "D", "K", "G", "P", "B"):
        tips.append("Don't forget the final consonant sound")

    # Vowel length: stress digits are removed before comparing
    if any(re.sub(r"[0-9]", "", p) in ("IY", "UW", "AO") for p in phonemes):
        tips.append("Make sure long vowels are actually longer")

    if not tips:
        tips.append("Break the word into syllables and practice each part")

    return tips
1525
+
1526
+
1527
def get_text_recommendations(text_analysis: Dict) -> List[str]:
    """Turn a text analysis into a list of practice recommendations.

    Reads ``difficulty_score``, ``challenging_sounds`` and ``words`` (each
    word entry exposing ``phonemes``) from ``text_analysis``. The returned
    list may be empty when no rule applies.
    """
    advice = []

    score = text_analysis["difficulty_score"]
    if score < 0.3:
        advice.append(
            "This text is good for beginners. Try adding more challenging words gradually."
        )
    elif score > 0.8:
        advice.append(
            "This is very challenging text. Consider starting with easier words first."
        )

    sounds = text_analysis["challenging_sounds"]
    if len(sounds) > 5:
        advice.append(
            "This text has many challenging sounds. Practice individual words first."
        )

    # Word length recommendations: count words with more than 8 phonemes
    long_word_count = 0
    for entry in text_analysis["words"]:
        if len(entry["phonemes"]) > 8:
            long_word_count += 1
    if long_word_count > 0:
        advice.append(
            "Break down longer words into syllables for easier practice."
        )

    return advice
1556
+
1557
+
1558
+ # =============================================================================
1559
+ # ADDITIONAL ENDPOINTS
1560
+ # =============================================================================
1561
+
1562
+
1563
@router.get("/stats")
async def get_system_stats():
    """Report dictionary and phoneme-model sizes plus static capability info.

    The two counts are read from the assessor's phoneme processor; every
    other field in the response is a fixed value.
    """
    processor = assessor.phoneme_processor
    dictionary = processor.cmu_dict

    stats = {
        "total_words_supported": len(dictionary),
        "phonemes_supported": len(processor.phoneme_models),
    }
    stats["difficulty_levels"] = ["easy", "medium", "hard"]
    stats["audio_formats_supported"] = ["wav", "mp3", "m4a", "flac"]
    stats["max_audio_duration"] = "30 seconds"
    stats["vietnamese_specific_features"] = True
    stats["features"] = [
        "CMU Pronouncing Dictionary integration",
        "IPA transcription",
        "Syllable analysis",
        "Contextual phoneme scoring",
        "Vietnamese learner optimization",
    ]
    return stats
1583
+
1584
+
1585
@router.get("/phonemes/difficult")
async def get_difficult_phonemes_for_vietnamese():
    """List the phonemes rated above 0.6 in the processor's difficulty map.

    Each entry carries the phoneme, its difficulty, articulation tips and
    example words; entries are ordered hardest first.

    NOTE(review): this path is registered on the same router after
    ``/phonemes/{word}``. Routes are matched in registration order, so a
    request for ``/phonemes/difficult`` appears to be handled by the
    ``{word}`` route with ``word="difficult"`` instead of reaching this
    function. Confirm, and move this route above the parameterised one.
    """
    ratings = assessor.phoneme_processor.difficulty_map

    # Only include challenging ones
    shortlisted = [
        {
            "phoneme": symbol,
            "difficulty": rating,
            "tips": get_phoneme_tips(symbol),
            "example_words": get_example_words(symbol),
        }
        for symbol, rating in ratings.items()
        if rating > 0.6
    ]

    # Hardest first
    ranked = sorted(shortlisted, key=lambda item: item["difficulty"], reverse=True)

    return {
        "difficult_phonemes": ranked,
        "total_count": len(ranked),
        "recommendation": "Focus on the top 5 most difficult sounds first",
    }
1609
+
1610
+
1611
def get_example_words(phoneme: str) -> List[str]:
    """Return sample English words that contain the given phoneme.

    Symbols without a curated list get a single placeholder entry of the
    form ``word_with_<lower-cased phoneme>``.
    """
    samples = {
        "TH": ["think", "three", "math", "path"],
        "DH": ["this", "that", "mother", "weather"],
        "V": ["very", "love", "give", "have"],
        "Z": ["zoo", "zero", "buzz", "rise"],
        "R": ["red", "car", "very", "right"],
        "L": ["love", "hello", "well", "people"],
        "W": ["water", "well", "what", "sweet"],
        "ZH": ["measure", "vision", "treasure"],
        "CH": ["chair", "much", "teach"],
        "JH": ["job", "bridge", "age"],
        "SH": ["shoe", "fish", "nation"],
        "NG": ["ring", "thing", "young"],
    }

    # The placeholder is built up front, so a non-string phoneme fails here
    placeholder = [f"word_with_{phoneme.lower()}"]
    if phoneme in samples:
        return samples[phoneme]
    return placeholder