ABAO77 commited on
Commit
c6480d4
·
1 Parent(s): dd47219

Implement code changes to enhance functionality and improve performance

Browse files
src/apis/__pycache__/create_app.cpython-311.pyc CHANGED
Binary files a/src/apis/__pycache__/create_app.cpython-311.pyc and b/src/apis/__pycache__/create_app.cpython-311.pyc differ
 
src/apis/routes/speaking_route.py CHANGED
@@ -1,29 +1,28 @@
1
- # ENHANCED PRONUNCIATION API - MULTI-WORD SUPPORT
2
- # Supports any English word using CMU Dict + phoneme libraries
 
3
 
4
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from pydantic import BaseModel
7
- from typing import List, Dict, Optional, Tuple
8
  import tempfile
9
  import os
10
  import numpy as np
11
  import librosa
12
  import nltk
13
  import eng_to_ipa as ipa
14
- import pronouncing
15
- import requests
16
- import json
17
  import re
18
  from collections import defaultdict
19
  import warnings
 
20
 
21
  warnings.filterwarnings("ignore")
22
 
23
  # Download required NLTK data
24
  try:
25
  nltk.download("cmudict", quiet=True)
26
- nltk.download("punkt", quiet=True)
27
  from nltk.corpus import cmudict
28
  except:
29
  print("Warning: NLTK data not available")
@@ -31,1119 +30,495 @@ except:
31
  # =============================================================================
32
  # MODELS
33
  # =============================================================================
34
- router = APIRouter(prefix="/speaking", tags=["AI"])
35
 
 
36
 
37
- class PronunciationResult(BaseModel):
 
 
 
 
38
  overall_score: float
39
- status: str
 
 
40
  feedback: List[str]
41
- words: List[Dict]
42
- phoneme_details: List[Dict]
43
- audio_info: Dict
44
- processing_time: float
45
- difficulty_analysis: Dict
46
-
47
-
48
- class WordPhonemeInfo(BaseModel):
49
- word: str
50
- phonemes: List[str]
51
- ipa_transcription: str
52
- syllables: List[str]
53
- stress_pattern: List[int]
54
-
55
 
56
  # =============================================================================
57
- # ENHANCED PHONEME PROCESSOR
58
  # =============================================================================
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- class EnhancedPhonemeProcessor:
62
- """Advanced phoneme processing with multiple dictionaries"""
 
63
 
 
 
 
64
  def __init__(self):
65
- self.sample_rate = 16000
66
-
67
- # Load CMU dictionary
68
  try:
69
  self.cmu_dict = cmudict.dict()
70
  except:
71
  self.cmu_dict = {}
72
  print("Warning: CMU dictionary not available")
73
-
74
- # Load comprehensive phoneme acoustic models
75
- self.phoneme_models = self._load_comprehensive_phoneme_models()
76
-
77
- # Phoneme difficulty for Vietnamese speakers
78
- self.difficulty_map = {
79
- # Very difficult for Vietnamese
80
- "TH": 0.9, # think, that
81
- "DH": 0.9, # this, then
82
- "V": 0.8, # very, love
83
- "Z": 0.8, # zoo, rise
84
- "ZH": 0.9, # measure, vision
85
- "R": 0.7, # red, car
86
- "L": 0.6, # love, well
87
- "W": 0.5, # water, well
88
- # Moderately difficult
89
- "F": 0.4, # fish, life
90
- "S": 0.3, # see, this
91
- "SH": 0.5, # shoe, fish
92
- "CH": 0.4, # chair, much
93
- "JH": 0.5, # job, bridge
94
- # Vowels - challenging distinctions
95
- "IY": 0.3, # beat
96
- "IH": 0.6, # bit
97
- "EY": 0.4, # bait
98
- "EH": 0.5, # bet
99
- "AE": 0.7, # bat
100
- "AH": 0.4, # but
101
- "AO": 0.6, # bought
102
- "OW": 0.4, # boat
103
- "UH": 0.6, # book
104
- "UW": 0.4, # boot
105
- # Easier sounds
106
- "P": 0.2,
107
- "B": 0.2,
108
- "T": 0.2,
109
- "D": 0.2,
110
- "K": 0.2,
111
- "G": 0.2,
112
- "M": 0.2,
113
- "N": 0.2,
114
- "NG": 0.3,
115
- }
116
-
117
- def get_word_phonemes(self, word: str) -> WordPhonemeInfo:
118
- """Get comprehensive phoneme info for any English word"""
119
- word_lower = word.lower().strip()
120
-
121
- # Method 1: CMU Dictionary (most reliable)
122
- cmu_phonemes = []
123
  if word_lower in self.cmu_dict:
124
- # Get first pronunciation variant
125
- cmu_phonemes = self.cmu_dict[word_lower][0]
126
- # Remove stress markers (0,1,2) from vowels
127
- cmu_phonemes = [re.sub(r"[0-9]", "", p) for p in cmu_phonemes]
128
-
129
- # Method 2: eng_to_ipa library
130
- ipa_transcription = ""
131
- try:
132
- ipa_transcription = ipa.convert(word)
133
- except:
134
- ipa_transcription = f"/{word}/"
135
-
136
- # Method 3: pronouncing library for syllables
137
- syllables = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  try:
139
- syllable_count = pronouncing.syllable_count(word)
140
- # Simple syllable division
141
- if syllable_count and len(word) > syllable_count:
142
- syllable_length = len(word) // syllable_count
143
- syllables = [
144
- word[i : i + syllable_length]
145
- for i in range(0, len(word), syllable_length)
146
- ]
147
- else:
148
- syllables = [word]
149
  except:
150
- syllables = [word]
151
-
152
- # Extract stress pattern from CMU
153
- stress_pattern = []
154
- if word_lower in self.cmu_dict:
155
- for phoneme in self.cmu_dict[word_lower][0]:
156
- stress = re.findall(r"[0-9]", phoneme)
157
- if stress:
158
- stress_pattern.append(int(stress[0]))
159
-
160
- # Fallback phonemes if CMU not available
161
- if not cmu_phonemes:
162
- cmu_phonemes = self._estimate_phonemes(word)
163
-
164
- return WordPhonemeInfo(
165
- word=word,
166
- phonemes=cmu_phonemes,
167
- ipa_transcription=ipa_transcription,
168
- syllables=syllables,
169
- stress_pattern=stress_pattern,
170
- )
171
-
172
  def _estimate_phonemes(self, word: str) -> List[str]:
173
  """Estimate phonemes for unknown words"""
174
- # Simple grapheme-to-phoneme mapping
175
  phoneme_map = {
176
- "ch": ["CH"],
177
- "sh": ["SH"],
178
- "th": ["TH"],
179
- "ph": ["F"],
180
- "ck": ["K"],
181
- "ng": ["NG"],
182
- "qu": ["K", "W"],
183
- "a": ["AE"],
184
- "e": ["EH"],
185
- "i": ["IH"],
186
- "o": ["AH"],
187
- "u": ["AH"],
188
- "b": ["B"],
189
- "c": ["K"],
190
- "d": ["D"],
191
- "f": ["F"],
192
- "g": ["G"],
193
- "h": ["HH"],
194
- "j": ["JH"],
195
- "k": ["K"],
196
- "l": ["L"],
197
- "m": ["M"],
198
- "n": ["N"],
199
- "p": ["P"],
200
- "r": ["R"],
201
- "s": ["S"],
202
- "t": ["T"],
203
- "v": ["V"],
204
- "w": ["W"],
205
- "x": ["K", "S"],
206
- "y": ["Y"],
207
- "z": ["Z"],
208
  }
209
-
210
  word = word.lower()
211
  phonemes = []
212
  i = 0
213
-
214
  while i < len(word):
215
  # Check 2-letter combinations first
216
- if i < len(word) - 1:
217
- two_char = word[i : i + 2]
218
  if two_char in phoneme_map:
219
  phonemes.extend(phoneme_map[two_char])
220
  i += 2
221
  continue
222
-
223
  # Single character
224
  char = word[i]
225
  if char in phoneme_map:
226
  phonemes.extend(phoneme_map[char])
227
-
228
  i += 1
229
-
230
  return phonemes
231
 
232
- def _load_comprehensive_phoneme_models(self) -> Dict:
233
- """Load comprehensive phoneme acoustic models"""
234
- # Extended phoneme set với acoustic characteristics
235
- models = {}
236
-
237
- # VOWELS
238
- vowel_models = {
239
- "IY": {"f1": 270, "f2": 2300, "duration": 150, "type": "vowel"}, # beat
240
- "IH": {"f1": 390, "f2": 1990, "duration": 120, "type": "vowel"}, # bit
241
- "EY": {"f1": 400, "f2": 2100, "duration": 160, "type": "vowel"}, # bait
242
- "EH": {"f1": 550, "f2": 1770, "duration": 130, "type": "vowel"}, # bet
243
- "AE": {"f1": 690, "f2": 1660, "duration": 140, "type": "vowel"}, # bat
244
- "AH": {"f1": 640, "f2": 1190, "duration": 110, "type": "vowel"}, # but
245
- "AO": {"f1": 570, "f2": 840, "duration": 150, "type": "vowel"}, # bought
246
- "OW": {"f1": 430, "f2": 1020, "duration": 160, "type": "vowel"}, # boat
247
- "UH": {"f1": 450, "f2": 1030, "duration": 120, "type": "vowel"}, # book
248
- "UW": {"f1": 310, "f2": 870, "duration": 150, "type": "vowel"}, # boot
249
- "ER": {"f1": 490, "f2": 1350, "duration": 140, "type": "vowel"}, # bird
250
- "AY": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"}, # bite
251
- "AW": {"f1": 640, "f2": 1190, "duration": 180, "type": "vowel"}, # bout
252
- "OY": {"f1": 570, "f2": 840, "duration": 180, "type": "vowel"}, # boy
253
- }
254
-
255
- # CONSONANTS
256
- consonant_models = {
257
- # Stops
258
- "P": {
259
- "burst_energy": 0.8,
260
- "duration": 80,
261
- "type": "stop",
262
- "voicing": False,
263
- },
264
- "B": {"burst_energy": 0.7, "duration": 85, "type": "stop", "voicing": True},
265
- "T": {
266
- "burst_energy": 0.9,
267
- "duration": 75,
268
- "type": "stop",
269
- "voicing": False,
270
- },
271
- "D": {
272
- "burst_energy": 0.75,
273
- "duration": 80,
274
- "type": "stop",
275
- "voicing": True,
276
- },
277
- "K": {
278
- "burst_energy": 0.85,
279
- "duration": 70,
280
- "type": "stop",
281
- "voicing": False,
282
- },
283
- "G": {"burst_energy": 0.7, "duration": 75, "type": "stop", "voicing": True},
284
- # Fricatives (challenging for Vietnamese)
285
- "F": {
286
- "high_freq": True,
287
- "duration": 120,
288
- "type": "fricative",
289
- "voicing": False,
290
- },
291
- "V": {
292
- "high_freq": True,
293
- "duration": 110,
294
- "type": "fricative",
295
- "voicing": True,
296
- },
297
- "TH": {
298
- "high_freq": True,
299
- "duration": 130,
300
- "type": "fricative",
301
- "voicing": False,
302
- }, # think
303
- "DH": {
304
- "high_freq": True,
305
- "duration": 120,
306
- "type": "fricative",
307
- "voicing": True,
308
- }, # this
309
- "S": {
310
- "very_high_freq": True,
311
- "duration": 140,
312
- "type": "fricative",
313
- "voicing": False,
314
- },
315
- "Z": {
316
- "very_high_freq": True,
317
- "duration": 130,
318
- "type": "fricative",
319
- "voicing": True,
320
- },
321
- "SH": {
322
- "high_freq": True,
323
- "duration": 150,
324
- "type": "fricative",
325
- "voicing": False,
326
- }, # shoe
327
- "ZH": {
328
- "high_freq": True,
329
- "duration": 140,
330
- "type": "fricative",
331
- "voicing": True,
332
- }, # measure
333
- "HH": {
334
- "breathy": True,
335
- "duration": 100,
336
- "type": "fricative",
337
- "voicing": False,
338
- }, # hello
339
- # Affricates
340
- "CH": {
341
- "burst_fricative": True,
342
- "duration": 160,
343
- "type": "affricate",
344
- "voicing": False,
345
- }, # chair
346
- "JH": {
347
- "burst_fricative": True,
348
- "duration": 150,
349
- "type": "affricate",
350
- "voicing": True,
351
- }, # job
352
- # Nasals
353
- "M": {"nasal": True, "duration": 100, "type": "nasal", "voicing": True},
354
- "N": {"nasal": True, "duration": 95, "type": "nasal", "voicing": True},
355
- "NG": {
356
- "nasal": True,
357
- "duration": 105,
358
- "type": "nasal",
359
- "voicing": True,
360
- }, # ring
361
- # Liquids (challenging L/R distinction)
362
- "L": {"lateral": True, "duration": 90, "type": "liquid", "voicing": True},
363
- "R": {"retroflex": True, "duration": 95, "type": "liquid", "voicing": True},
364
- # Glides
365
- "Y": {"glide": True, "duration": 70, "type": "glide", "voicing": True},
366
- "W": {"glide": True, "duration": 75, "type": "glide", "voicing": True},
367
- }
368
-
369
- # Combine models
370
- models.update(vowel_models)
371
- models.update(consonant_models)
372
-
373
- return models
374
-
375
- def get_difficulty_score(self, phonemes: List[str]) -> float:
376
- """Calculate difficulty score for Vietnamese speakers"""
377
- if not phonemes:
378
- return 0.5
379
-
380
- difficulties = []
381
- for phoneme in phonemes:
382
- # Remove stress markers
383
- clean_phoneme = re.sub(r"[0-9]", "", phoneme)
384
- difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
385
- difficulties.append(difficulty)
386
-
387
- return np.mean(difficulties)
388
-
389
- def score_phoneme_advanced(
390
- self, phoneme: str, segment_features: Dict, context: Dict = None
391
- ) -> float:
392
- """Advanced phoneme scoring với context"""
393
- clean_phoneme = re.sub(r"[0-9]", "", phoneme)
394
-
395
- if clean_phoneme not in self.phoneme_models:
396
- return 0.5 # Unknown phoneme
397
-
398
- model = self.phoneme_models[clean_phoneme]
399
- score = 0.0
400
-
401
- # Type-specific scoring
402
- if model["type"] == "vowel":
403
- score = self._score_vowel(clean_phoneme, segment_features, model)
404
- elif model["type"] == "fricative":
405
- score = self._score_fricative(clean_phoneme, segment_features, model)
406
- elif model["type"] == "stop":
407
- score = self._score_stop(clean_phoneme, segment_features, model)
408
- elif model["type"] in ["liquid", "nasal", "glide", "affricate"]:
409
- score = self._score_other_consonant(clean_phoneme, segment_features, model)
410
-
411
- # Context adjustments
412
- if context:
413
- score = self._apply_context_adjustments(score, clean_phoneme, context)
414
-
415
- # Difficulty adjustment for Vietnamese speakers
416
- difficulty = self.difficulty_map.get(clean_phoneme, 0.3)
417
- # Easier scoring for more difficult phonemes
418
- adjusted_score = score + (difficulty * 0.1)
419
-
420
- return np.clip(adjusted_score, 0, 1)
421
-
422
- def _score_vowel(self, phoneme: str, features: Dict, model: Dict) -> float:
423
- """Score vowel phoneme"""
424
- score = 0.0
425
-
426
- # Energy check (vowels should have good energy)
427
- if features.get("rms_mean", 0) > 0.01:
428
- score += 0.3
429
-
430
- # Spectral characteristics
431
- centroid = features.get("spectral_centroid_mean", 0)
432
- target_f2 = model.get("f2", 1500)
433
-
434
- # F2 approximation from spectral centroid
435
- f2_error = abs(centroid - target_f2) / target_f2
436
- f2_score = max(0, 1 - f2_error)
437
- score += 0.4 * f2_score
438
-
439
- # Stability (vowels should be stable)
440
- zcr = features.get("zcr_mean", 0)
441
- if zcr < 0.1: # Low zero crossing for vowels
442
- score += 0.3
443
-
444
- return score
445
-
446
- def _score_fricative(self, phoneme: str, features: Dict, model: Dict) -> float:
447
- """Score fricative phoneme"""
448
- score = 0.0
449
-
450
- # High frequency content for fricatives
451
- centroid = features.get("spectral_centroid_mean", 0)
452
- zcr = features.get("zcr_mean", 0)
453
-
454
- if model.get("very_high_freq"): # S, Z sounds
455
- if centroid > 3000:
456
- score += 0.4
457
- if zcr > 0.2:
458
- score += 0.4
459
- elif model.get("high_freq"): # F, V, TH, DH, SH, ZH
460
- if centroid > 1500:
461
- score += 0.4
462
- if zcr > 0.15:
463
- score += 0.3
464
-
465
- # Voicing check
466
- energy = features.get("rms_mean", 0)
467
- if model.get("voicing") and energy > 0.01: # Voiced fricatives
468
- score += 0.2
469
- elif not model.get("voicing") and energy < 0.05: # Voiceless fricatives
470
- score += 0.2
471
-
472
- return score
473
-
474
- def _score_stop(self, phoneme: str, features: Dict, model: Dict) -> float:
475
- """Score stop consonant"""
476
- score = 0.0
477
-
478
- # Burst energy
479
- energy = features.get("rms_mean", 0)
480
- burst_threshold = 0.02 if model.get("voicing") else 0.03
481
-
482
- if energy > burst_threshold:
483
- score += 0.6
484
-
485
- # Duration check
486
- # Stops should be relatively short
487
- score += 0.4 # Base score for presence
488
-
489
- return score
490
-
491
- def _score_other_consonant(
492
- self, phoneme: str, features: Dict, model: Dict
493
- ) -> float:
494
- """Score other consonant types"""
495
- score = 0.0
496
-
497
- energy = features.get("rms_mean", 0)
498
- centroid = features.get("spectral_centroid_mean", 0)
499
- zcr = features.get("zcr_mean", 0)
500
-
501
- if model["type"] == "liquid":
502
- # L/R sounds - moderate energy, specific spectral characteristics
503
- if 0.01 <= energy <= 0.08:
504
- score += 0.3
505
- if phoneme == "R" and centroid < 1800: # R lowers F3
506
- score += 0.4
507
- elif phoneme == "L" and 1200 <= centroid <= 2200:
508
- score += 0.4
509
- score += 0.3 # Base score
510
-
511
- elif model["type"] == "nasal":
512
- # Nasal sounds - good energy, specific spectral pattern
513
- if energy > 0.005:
514
- score += 0.4
515
- if 800 <= centroid <= 2000:
516
- score += 0.3
517
- score += 0.3
518
-
519
- elif model["type"] == "glide":
520
- # W/Y sounds - transition characteristics
521
- if energy > 0.005:
522
- score += 0.5
523
- score += 0.5
524
-
525
- elif model["type"] == "affricate":
526
- # CH/JH - combination of stop + fricative
527
- if energy > 0.02: # Burst component
528
- score += 0.3
529
- if zcr > 0.1: # Fricative component
530
- score += 0.4
531
- score += 0.3
532
-
533
- return score
534
-
535
- def _apply_context_adjustments(
536
- self, score: float, phoneme: str, context: Dict
537
- ) -> float:
538
- """Apply contextual adjustments"""
539
- # Position in word adjustments
540
- position = context.get("position", "middle")
541
-
542
- if position == "initial" and phoneme in ["TH", "DH"]:
543
- score *= 1.1 # Easier in initial position
544
- elif position == "final" and phoneme in ["T", "D", "K", "G"]:
545
- score *= 0.9 # Harder in final position (Vietnamese tendency to drop)
546
-
547
- # Surrounding phonemes
548
- prev_phoneme = context.get("prev_phoneme")
549
- next_phoneme = context.get("next_phoneme")
550
-
551
- # Consonant clusters (difficult for Vietnamese)
552
- if (
553
- prev_phoneme
554
- and prev_phoneme in ["S", "T", "K"]
555
- and phoneme in ["T", "K", "P"]
556
- ):
557
- score *= 0.8 # Consonant clusters are harder
558
-
559
- return score
560
-
561
-
562
  # =============================================================================
563
- # ENHANCED PRONUNCIATION ASSESSOR
564
  # =============================================================================
565
 
566
-
567
- class EnhancedPronunciationAssessor:
568
- """Enhanced assessor supporting any English word"""
569
-
570
  def __init__(self):
571
- self.phoneme_processor = EnhancedPhonemeProcessor()
572
- self.sample_rate = 16000
573
-
574
- def process_audio_file(self, file_path: str, reference_text: str) -> Dict:
575
- """Process audio file with enhanced phoneme analysis"""
576
-
577
- # Load and validate audio
578
- audio, sr = librosa.load(file_path, sr=self.sample_rate)
579
- duration = len(audio) / sr
580
- max_amplitude = np.max(np.abs(audio))
581
-
582
- # Audio quality analysis
583
- audio_info = self._analyze_audio_quality(audio, duration, max_amplitude)
584
-
585
- # Extract comprehensive features
586
- features = self._extract_comprehensive_features(audio)
587
-
588
- # Text analysis
589
- text_analysis = self._analyze_text(reference_text)
590
-
591
- # Pronunciation assessment
592
- pronunciation_analysis = self._assess_pronunciation(
593
- audio, features, reference_text, text_analysis
594
- )
595
-
596
- return {
597
- "audio_info": audio_info,
598
- "text_analysis": text_analysis,
599
- "pronunciation_analysis": pronunciation_analysis,
600
- "features": features,
601
- }
602
-
603
- def _analyze_audio_quality(
604
- self, audio: np.ndarray, duration: float, max_amplitude: float
605
- ) -> Dict:
606
- """Comprehensive audio quality analysis"""
607
- issues = []
608
- quality_score = 1.0
609
-
610
- # Duration checks
611
- if duration < 0.5:
612
- issues.append("too_short")
613
- quality_score *= 0.5
614
- elif duration > 30:
615
- issues.append("too_long")
616
- quality_score *= 0.8
617
-
618
- # Amplitude checks
619
- if max_amplitude < 0.005:
620
- issues.append("too_quiet")
621
- quality_score *= 0.6
622
- elif max_amplitude > 0.98:
623
- issues.append("clipped")
624
- quality_score *= 0.7
625
-
626
- # Noise analysis
627
- noise_floor = np.mean(np.abs(audio[: int(0.1 * len(audio))])) # First 100ms
628
- if noise_floor > 0.02:
629
- issues.append("noisy")
630
- quality_score *= 0.8
631
-
632
- # Signal-to-noise ratio
633
- signal_power = np.mean(audio**2)
634
- snr = 10 * np.log10(signal_power / (noise_floor**2 + 1e-10))
635
-
636
- return {
637
- "duration": duration,
638
- "max_amplitude": max_amplitude,
639
- "noise_floor": noise_floor,
640
- "snr": snr,
641
- "quality_score": quality_score,
642
- "issues": issues,
643
- "quality_status": "good" if not issues else ",".join(issues),
644
  }
645
-
646
- def _extract_comprehensive_features(self, audio: np.ndarray) -> Dict:
647
- """Extract comprehensive acoustic features"""
648
- features = {}
649
-
650
- # Basic features
651
- features["mfcc"] = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
652
- features["mfcc_mean"] = np.mean(features["mfcc"], axis=1).tolist()
653
-
654
- # Energy features
655
- rms = librosa.feature.rms(y=audio, hop_length=512)[0]
656
- features["rms"] = rms.tolist()
657
- features["rms_mean"] = float(np.mean(rms))
658
- features["rms_std"] = float(np.std(rms))
659
-
660
- # Spectral features
661
- spectral_centroid = librosa.feature.spectral_centroid(
662
- y=audio, sr=self.sample_rate
663
- )[0]
664
- features["spectral_centroid"] = spectral_centroid.tolist()
665
- features["spectral_centroid_mean"] = float(np.mean(spectral_centroid))
666
- features["spectral_centroid_std"] = float(np.std(spectral_centroid))
667
-
668
- # Additional spectral features
669
- spectral_bandwidth = librosa.feature.spectral_bandwidth(
670
- y=audio, sr=self.sample_rate
671
- )[0]
672
- features["spectral_bandwidth_mean"] = float(np.mean(spectral_bandwidth))
673
-
674
- spectral_rolloff = librosa.feature.spectral_rolloff(
675
- y=audio, sr=self.sample_rate
676
- )[0]
677
- features["spectral_rolloff_mean"] = float(np.mean(spectral_rolloff))
678
-
679
- # Zero crossing rate
680
- zcr = librosa.feature.zero_crossing_rate(audio, hop_length=512)[0]
681
- features["zcr"] = zcr.tolist()
682
- features["zcr_mean"] = float(np.mean(zcr))
683
- features["zcr_std"] = float(np.std(zcr))
684
-
685
- # Pitch analysis
686
- pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate)
687
- f0 = []
688
- for t in range(pitches.shape[1]):
689
- index = magnitudes[:, t].argmax()
690
- pitch = pitches[index, t]
691
- f0.append(
692
- float(pitch) if pitch > 80 else 0.0
693
- ) # Filter out very low frequencies
694
-
695
- features["f0"] = f0
696
- valid_f0 = [f for f in f0 if f > 0]
697
- features["f0_mean"] = float(np.mean(valid_f0)) if valid_f0 else 0.0
698
- features["f0_std"] = float(np.std(valid_f0)) if valid_f0 else 0.0
699
-
700
- # Formant estimation (simplified)
701
- features["formants"] = self._estimate_formants(audio)
702
-
703
- return features
704
-
705
-
706
-
707
- def _analyze_text(self, text: str) -> Dict:
708
- """Analyze reference text for phonemes and difficulty"""
709
- words = text.lower().strip().split()
710
- text_info = {
711
- "words": [],
712
- "total_phonemes": 0,
713
- "difficulty_score": 0,
714
- "challenging_sounds": [],
715
  }
716
-
717
- all_phonemes = []
718
-
719
- for word in words:
720
- word_info = self.phoneme_processor.get_word_phonemes(word)
721
-
722
- # Calculate word difficulty
723
- word_difficulty = self.phoneme_processor.get_difficulty_score(
724
- word_info.phonemes
725
- )
726
-
727
- # Find challenging phonemes
728
- challenging = []
729
- for phoneme in word_info.phonemes:
730
- clean_phoneme = re.sub(r"[0-9]", "", phoneme)
731
- difficulty = self.phoneme_processor.difficulty_map.get(clean_phoneme, 0)
732
- if difficulty > 0.6:
733
- challenging.append(clean_phoneme)
734
-
735
- word_data = {
736
- "word": word,
737
- "phonemes": word_info.phonemes,
738
- "ipa": word_info.ipa_transcription,
739
- "syllables": word_info.syllables,
740
- "difficulty": word_difficulty,
741
- "challenging_phonemes": challenging,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  }
 
 
 
 
 
 
 
 
 
743
 
744
- text_info["words"].append(word_data)
745
- all_phonemes.extend(word_info.phonemes)
746
- text_info["challenging_sounds"].extend(challenging)
747
-
748
- text_info["total_phonemes"] = len(all_phonemes)
749
- text_info["difficulty_score"] = self.phoneme_processor.get_difficulty_score(
750
- all_phonemes
751
- )
752
- text_info["challenging_sounds"] = list(
753
- set(text_info["challenging_sounds"])
754
- ) # Remove duplicates
755
-
756
- return text_info
757
-
758
- def _assess_pronunciation(
759
- self, audio: np.ndarray, features: Dict, text: str, text_analysis: Dict
760
- ) -> Dict:
761
- """Comprehensive pronunciation assessment"""
762
- words = text.lower().strip().split()
763
- word_segments = self._segment_words_advanced(audio, features, len(words))
764
-
765
- word_results = []
766
- phoneme_results = []
767
-
768
- for i, word in enumerate(words):
769
- if i < len(word_segments):
770
- word_audio = word_segments[i]
771
- word_info = text_analysis["words"][i]
772
-
773
- # Assess word
774
- word_result = self._assess_word_comprehensive(
775
- word_audio, word_info, features, i, len(words)
776
- )
777
-
778
- word_results.append(word_result)
779
- phoneme_results.extend(word_result["phoneme_details"])
780
-
781
- # Calculate overall metrics
782
- overall_score = (
783
- np.mean([wr["score"] for wr in word_results]) if word_results else 0.0
784
- )
785
-
786
- # Generate comprehensive feedback
787
- feedback = self._generate_comprehensive_feedback(
788
- word_results, text_analysis, features, overall_score
789
- )
790
 
791
- # Difficulty analysis
792
- difficulty_analysis = self._analyze_difficulty_performance(
793
- word_results, text_analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  )
795
-
 
 
 
 
 
 
796
  return {
797
- "overall_score": overall_score,
798
- "words": word_results,
799
- "phoneme_details": phoneme_results,
800
- "feedback": feedback,
801
- "status": self._get_status(overall_score),
802
- "difficulty_analysis": difficulty_analysis,
803
  }
804
-
805
- def _segment_words_advanced(
806
- self, audio: np.ndarray, features: Dict, num_words: int
807
- ) -> List[np.ndarray]:
808
- """Advanced word segmentation using energy and spectral cues"""
809
- if num_words == 1:
810
- return [audio]
811
-
812
- # Use RMS energy to find word boundaries
813
- rms = features["rms"]
814
-
815
- # Find energy peaks (potential word centers)
816
- from scipy.signal import find_peaks
817
-
818
- # Smooth RMS for better peak detection
819
- window_size = min(5, len(rms) // 4)
820
- if window_size > 0:
821
- rms_smooth = np.convolve(
822
- rms, np.ones(window_size) / window_size, mode="same"
823
- )
824
- else:
825
- rms_smooth = rms
826
-
827
- peaks, _ = find_peaks(
828
- rms_smooth,
829
- height=np.mean(rms_smooth) * 0.5,
830
- distance=len(rms) // (num_words * 2),
831
- )
832
-
833
- # If we don't find enough peaks, fall back to equal division
834
- if len(peaks) < num_words:
835
- segment_length = len(audio) // num_words
836
- segments = []
837
- for i in range(num_words):
838
- start = i * segment_length
839
- end = start + segment_length if i < num_words - 1 else len(audio)
840
- segments.append(audio[start:end])
841
- return segments
842
-
843
- # Use peaks to define word boundaries
844
- hop_length = 512
845
- peak_times = librosa.frames_to_samples(peaks, hop_length=hop_length)
846
-
847
- segments = []
848
- for i in range(num_words):
849
- if i == 0:
850
- start = 0
851
- end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
852
- num_words * 4
853
- )
854
- elif i == num_words - 1:
855
- start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
856
- num_words * 4
857
- )
858
- end = len(audio)
859
- else:
860
- start = peak_times[min(i - 1, len(peak_times) - 1)] - len(audio) // (
861
- num_words * 6
862
- )
863
- end = peak_times[min(i, len(peak_times) - 1)] + len(audio) // (
864
- num_words * 6
865
- )
866
-
867
- start = max(0, start)
868
- end = min(len(audio), end)
869
- segments.append(audio[start:end])
870
-
871
- return segments
872
-
873
- def _assess_word_comprehensive(
874
- self,
875
- word_audio: np.ndarray,
876
- word_info: Dict,
877
- global_features: Dict,
878
- word_index: int,
879
- total_words: int,
880
- ) -> Dict:
881
- """Comprehensive word assessment"""
882
- if len(word_audio) < 500:
883
- return {
884
- "word": word_info["word"],
885
- "score": 0.2,
886
- "status": "poor",
887
- "issues": ["too_short"],
888
- "phoneme_details": [],
889
  }
890
-
891
- # Extract word-level features
892
- word_features = self._extract_word_features(word_audio)
893
-
894
- # Assess each phoneme
895
- phonemes = word_info["phonemes"]
896
- phoneme_segments = self._segment_phonemes(word_audio, len(phonemes))
897
-
898
- phoneme_scores = []
899
- phoneme_details = []
900
-
901
- for i, (phoneme, segment) in enumerate(zip(phonemes, phoneme_segments)):
902
- if len(segment) > 100: # Minimum segment length
903
- segment_features = self._extract_segment_features(segment)
904
-
905
- # Context information
906
- context = {
907
- "position": (
908
- "initial"
909
- if i == 0
910
- else "final" if i == len(phonemes) - 1 else "middle"
911
- ),
912
- "prev_phoneme": phonemes[i - 1] if i > 0 else None,
913
- "next_phoneme": phonemes[i + 1] if i < len(phonemes) - 1 else None,
914
- "word_position": word_index / total_words,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
915
  }
916
-
917
- score = self.phoneme_processor.score_phoneme_advanced(
918
- phoneme, segment_features, context
919
- )
920
-
921
- phoneme_scores.append(score)
922
- phoneme_details.append(
923
- {
924
- "phoneme": phoneme,
925
- "score": score,
926
- "position": context["position"],
927
- "difficulty": self.phoneme_processor.difficulty_map.get(
928
- re.sub(r"[0-9]", "", phoneme), 0.3
929
- ),
930
- "word": word_info["word"],
931
- }
932
- )
933
-
934
- # Word-level score
935
- word_score = np.mean(phoneme_scores) if phoneme_scores else 0.0
936
-
937
- # Detect issues
938
- issues = []
939
- if word_score < 0.3:
940
- issues.append("very_poor_clarity")
941
- if word_features.get("rms_mean", 0) < 0.005:
942
- issues.append("too_quiet")
943
- if word_features.get("zcr_mean", 0) > 0.3:
944
- issues.append("too_noisy")
945
-
946
- return {
947
- "word": word_info["word"],
948
- "score": word_score,
949
- "status": self._get_word_status(word_score),
950
- "phonemes": phonemes,
951
- "phoneme_scores": phoneme_scores,
952
- "phoneme_details": phoneme_details,
953
- "ipa": word_info["ipa"],
954
- "syllables": word_info["syllables"],
955
- "difficulty": word_info["difficulty"],
956
- "issues": issues,
957
- }
958
-
959
- def _extract_word_features(self, word_audio: np.ndarray) -> Dict:
960
- """Extract features for word segment"""
961
- if len(word_audio) < 100:
962
- return {}
963
-
964
- mfcc = librosa.feature.mfcc(y=word_audio, sr=self.sample_rate, n_mfcc=13)
965
- rms = librosa.feature.rms(y=word_audio)[0]
966
- centroid = librosa.feature.spectral_centroid(y=word_audio, sr=self.sample_rate)[
967
- 0
968
- ]
969
- zcr = librosa.feature.zero_crossing_rate(word_audio)[0]
970
-
971
- return {
972
- "mfcc_mean": np.mean(mfcc, axis=1).tolist(),
973
- "rms_mean": float(np.mean(rms)),
974
- "spectral_centroid_mean": float(np.mean(centroid)),
975
- "zcr_mean": float(np.mean(zcr)),
976
- }
977
-
978
- def _segment_phonemes(
979
- self, word_audio: np.ndarray, num_phonemes: int
980
- ) -> List[np.ndarray]:
981
- """Segment word audio into phonemes"""
982
- if num_phonemes <= 1:
983
- return [word_audio]
984
-
985
- segment_length = len(word_audio) // num_phonemes
986
- segments = []
987
-
988
- for i in range(num_phonemes):
989
- start = i * segment_length
990
- end = start + segment_length if i < num_phonemes - 1 else len(word_audio)
991
- segments.append(word_audio[start:end])
992
-
993
- return segments
994
-
995
- def _extract_segment_features(self, segment: np.ndarray) -> Dict:
996
- """Extract features for phoneme segment"""
997
- if len(segment) < 50:
998
- return {}
999
-
1000
- # Basic features for short segments
1001
- rms_mean = float(np.mean(librosa.feature.rms(y=segment)[0]))
1002
- zcr_mean = float(np.mean(librosa.feature.zero_crossing_rate(segment)[0]))
1003
-
1004
- # Spectral centroid
1005
- centroid = librosa.feature.spectral_centroid(y=segment, sr=self.sample_rate)[0]
1006
- centroid_mean = float(np.mean(centroid))
1007
-
1008
- # MFCC for short segment
1009
- if len(segment) > 512:
1010
- mfcc = librosa.feature.mfcc(y=segment, sr=self.sample_rate, n_mfcc=5)
1011
- mfcc_mean = np.mean(mfcc, axis=1).tolist()
1012
- else:
1013
- mfcc_mean = [0] * 5
1014
-
1015
- return {
1016
- "rms_mean": rms_mean,
1017
- "zcr_mean": zcr_mean,
1018
- "spectral_centroid_mean": centroid_mean,
1019
- "mfcc_mean": mfcc_mean,
1020
- }
1021
-
1022
- def _generate_comprehensive_feedback(
1023
- self,
1024
- word_results: List[Dict],
1025
- text_analysis: Dict,
1026
- features: Dict,
1027
- overall_score: float,
1028
- ) -> List[str]:
1029
- """Generate comprehensive feedback"""
1030
- feedback = []
1031
-
1032
- # Overall performance feedback
1033
- if overall_score >= 0.85:
1034
- feedback.append(
1035
- "🎉 Outstanding pronunciation! You sound very natural and clear."
1036
- )
1037
- elif overall_score >= 0.7:
1038
- feedback.append(
1039
- "👍 Great job! Your pronunciation is quite good with room for minor improvements."
1040
- )
1041
- elif overall_score >= 0.5:
1042
- feedback.append(
1043
- "📚 Good progress! Keep practicing the areas highlighted below."
1044
- )
1045
- elif overall_score >= 0.3:
1046
- feedback.append(
1047
- "🔄 Keep working on it! Focus on clarity and the specific sounds mentioned."
1048
- )
1049
- else:
1050
- feedback.append(
1051
- "💪 Don't give up! Start with slower, clearer pronunciation."
1052
- )
1053
-
1054
- # Audio quality feedback
1055
- audio_quality = features.get("rms_mean", 0)
1056
- if audio_quality < 0.01:
1057
- feedback.append(
1058
- "🔊 Try speaking louder and more clearly - your recording was quite quiet."
1059
- )
1060
- elif audio_quality > 0.15:
1061
- feedback.append("🔉 Good volume level! Your voice comes through clearly.")
1062
-
1063
- # Pitch variation feedback
1064
- pitch_std = features.get("f0_std", 0)
1065
- if pitch_std < 20:
1066
- feedback.append(
1067
- "🎵 Try adding more natural pitch variation to sound more engaging."
1068
- )
1069
- elif pitch_std > 80:
1070
- feedback.append(
1071
- "🎵 Good pitch variation! Your speech sounds natural and expressive."
1072
- )
1073
-
1074
- # Word-specific feedback
1075
- poor_words = [wr for wr in word_results if wr["score"] < 0.5]
1076
- if poor_words:
1077
- word_names = [w["word"] for w in poor_words]
1078
- feedback.append(f"🎯 Focus extra practice on: {', '.join(word_names)}")
1079
-
1080
- # Phoneme-specific feedback for Vietnamese speakers
1081
- all_challenging = []
1082
- for word_result in word_results:
1083
- for phoneme_detail in word_result.get("phoneme_details", []):
1084
- if phoneme_detail["score"] < 0.5 and phoneme_detail["difficulty"] > 0.6:
1085
- all_challenging.append(phoneme_detail["phoneme"])
1086
-
1087
- if all_challenging:
1088
- unique_challenging = list(set(all_challenging))
1089
- vietnamese_tips = {
1090
- "TH": "Put your tongue between your teeth and blow air gently",
1091
- "DH": "Same tongue position as TH, but vibrate your vocal cords",
1092
- "V": "Touch your bottom lip to your top teeth, then voice",
1093
- "R": "Curl your tongue without touching the roof of your mouth",
1094
- "L": "Touch your tongue tip to the roof of your mouth",
1095
- "Z": "Like 'S' but with vocal cord vibration",
1096
- }
1097
-
1098
- for phoneme in unique_challenging[:3]: # Top 3 challenging
1099
- clean_phoneme = re.sub(r"[0-9]", "", phoneme)
1100
- if clean_phoneme in vietnamese_tips:
1101
- feedback.append(
1102
- f"🔤 {clean_phoneme} sound: {vietnamese_tips[clean_phoneme]}"
1103
- )
1104
-
1105
- # Difficulty-based encouragement
1106
- text_difficulty = text_analysis["difficulty_score"]
1107
- if text_difficulty > 0.7 and overall_score > 0.6:
1108
- feedback.append(
1109
- "💪 Impressive! You tackled some very challenging sounds for Vietnamese speakers."
1110
- )
1111
- elif text_difficulty < 0.3 and overall_score < 0.7:
1112
- feedback.append("📈 Try some more challenging words as you improve!")
1113
-
1114
- return feedback
1115
-
1116
- def _analyze_difficulty_performance(
1117
- self, word_results: List[Dict], text_analysis: Dict
1118
- ) -> Dict:
1119
- """Analyze performance vs difficulty"""
1120
- easy_phonemes = [] # difficulty < 0.4
1121
- medium_phonemes = [] # 0.4 <= difficulty < 0.7
1122
- hard_phonemes = [] # difficulty >= 0.7
1123
-
1124
- for word_result in word_results:
1125
- for phoneme_detail in word_result.get("phoneme_details", []):
1126
- difficulty = phoneme_detail["difficulty"]
1127
- score = phoneme_detail["score"]
1128
-
1129
- if difficulty < 0.4:
1130
- easy_phonemes.append(score)
1131
- elif difficulty < 0.7:
1132
- medium_phonemes.append(score)
1133
- else:
1134
- hard_phonemes.append(score)
1135
-
1136
- return {
1137
- "easy_sounds_avg": float(np.mean(easy_phonemes)) if easy_phonemes else 0.0,
1138
- "medium_sounds_avg": (
1139
- float(np.mean(medium_phonemes)) if medium_phonemes else 0.0
1140
- ),
1141
- "hard_sounds_avg": float(np.mean(hard_phonemes)) if hard_phonemes else 0.0,
1142
- "total_challenging_sounds": len(hard_phonemes),
1143
- "mastered_difficult_sounds": len([s for s in hard_phonemes if s > 0.7]),
1144
- "text_difficulty": text_analysis["difficulty_score"],
1145
- }
1146
-
1147
  def _get_word_status(self, score: float) -> str:
1148
  """Get word status from score"""
1149
  if score >= 0.8:
@@ -1154,475 +529,370 @@ class EnhancedPronunciationAssessor:
1154
  return "needs_practice"
1155
  else:
1156
  return "poor"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1157
 
1158
def _get_status(self, score: float) -> str:
    """Get overall status.

    Delegates to ``_get_word_status``, so the overall score is labelled
    with the same thresholds as an individual word score.
    """
    return self._get_word_status(score)
1161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1162
 
1163
  # =============================================================================
1164
- # ENHANCED FASTAPI APP
1165
  # =============================================================================
1166
 
1167
-
1168
# Initialize enhanced processor: one module-level instance, created at import
# time, that the route handlers in this module reference as `assessor`.
assessor = EnhancedPronunciationAssessor()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1170
 
1171
  # =============================================================================
1172
- # ENHANCED ENDPOINTS
1173
  # =============================================================================
1174
 
1175
-
1176
@router.post("/assess", response_model=PronunciationResult)
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file"),
    reference_text: str = Form(..., description="Any English text"),
    difficulty_level: str = Form("medium", description="easy, medium, hard"),
):
    """
    Assess pronunciation for ANY English text
    Supports 60,000+ words from CMU Pronouncing Dictionary

    The upload is written to a temporary file, scored by the module-level
    ``assessor``, and the temporary file is removed afterwards whether or
    not scoring succeeded.

    ``difficulty_level`` rescales the scores: "easy" multiplies the overall
    and per-word scores by 1.2 (capped at 1.0), "hard" multiplies them by
    0.8, and any other value leaves them unchanged.

    Raises:
        HTTPException: 400 when ``reference_text`` is empty, longer than
            1000 characters, or contains characters outside letters,
            whitespace and ``,.'-!?;:``; 500 when processing fails.
    """
    import time

    start_time = time.time()
    print("Starting pronunciation assessment...")
    print("Reference text:", reference_text)
    print("Difficulty level:", difficulty_level)
    print("Audio filename:", audio.filename if audio else "None")

    # Validate inputs
    if not reference_text.strip():
        print("Validation failed: Reference text is empty")
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")

    if len(reference_text) > 1000:
        print("Validation failed: Reference text too long")
        raise HTTPException(
            status_code=400, detail="Reference text too long (max 1000 characters)"
        )

    # Check if text contains only valid characters
    # Updated regex to be more permissive and include common punctuation like commas
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        print("Validation failed: Invalid characters in text")
        print("Text that failed validation:", repr(reference_text))
        raise HTTPException(
            status_code=400,
            detail="Text contains invalid characters. Only English letters, spaces, and basic punctuation (,.'-!?;:) allowed.",
        )

    tmp_path = None
    try:
        # Save uploaded file
        print("Saving uploaded file...")
        # Handle cases where filename might be None or empty
        file_extension = ".wav"
        if audio.filename:
            # The filename is client-supplied: reduce it to its base name first
            # so a dot inside a directory component can never put a path
            # separator into the temp-file suffix.
            base_name = os.path.basename(audio.filename)
            if "." in base_name:
                file_extension = f".{base_name.split('.')[-1]}"

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as tmp_file:
            # Record the path before writing so a failed write is cleaned up too.
            tmp_path = tmp_file.name
            content = await audio.read()
            tmp_file.write(content)
            tmp_file.flush()
            print("File saved to:", tmp_path)
            print("File size:", len(content), "bytes")

        # Process with enhanced assessor (the file is closed by now, so it can
        # be reopened by path on every platform).
        print("Processing audio file...")
        result = assessor.process_audio_file(tmp_path, reference_text)
        print("Audio processing completed")

        # Apply difficulty adjustments
        analysis = result["pronunciation_analysis"]
        if difficulty_level == "easy":
            analysis["overall_score"] = min(1.0, analysis["overall_score"] * 1.2)
            for word in analysis["words"]:
                word["score"] = min(1.0, word["score"] * 1.2)
        elif difficulty_level == "hard":
            analysis["overall_score"] = analysis["overall_score"] * 0.8
            for word in analysis["words"]:
                word["score"] = word["score"] * 0.8

        processing_time = time.time() - start_time
        print("Processing completed successfully in", processing_time, "seconds")

        return PronunciationResult(
            overall_score=analysis["overall_score"],
            status=analysis["status"],
            feedback=analysis["feedback"],
            words=analysis["words"],
            phoneme_details=analysis["phoneme_details"],
            audio_info=result["audio_info"],
            processing_time=processing_time,
            difficulty_analysis=analysis["difficulty_analysis"],
        )

    except Exception as e:
        print("Exception occurred during processing:", str(e))
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500, detail=f"Processing error: {str(e)}"
        ) from e

    finally:
        # Clean up: the temp file is created with delete=False, so it has to be
        # removed by hand on both the success and the failure path.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best effort: a cleanup failure must not mask the real outcome.
                pass
1271
 
 
 
 
1272
 
1273
@router.get("/phonemes/{word}")
async def get_word_phonemes(word: str):
    """Return phoneme information for a single word.

    The response carries the word's phonemes, IPA transcription, syllables
    and stress pattern as produced by ``assessor.phoneme_processor``, an
    overall difficulty score with its easy/medium/hard label, the phonemes
    whose mapped difficulty exceeds 0.6 (with tips), and word-level tips.

    Raises:
        HTTPException: 500 when any step of the lookup fails.
    """
    try:
        processor = assessor.phoneme_processor
        info = processor.get_word_phonemes(word)

        # Calculate difficulty for Vietnamese speakers
        score = processor.get_difficulty_score(info.phonemes)

        # Collect the challenging phonemes (stress digits removed first)
        hard_sounds = []
        for raw_phoneme in info.phonemes:
            base = re.sub(r"[0-9]", "", raw_phoneme)
            base_difficulty = processor.difficulty_map.get(base, 0)
            if base_difficulty > 0.6:
                entry = {
                    "phoneme": base,
                    "difficulty": base_difficulty,
                    "tips": get_phoneme_tips(base),
                }
                hard_sounds.append(entry)

        if score > 0.7:
            label = "hard"
        elif score > 0.4:
            label = "medium"
        else:
            label = "easy"

        return {
            "word": word,
            "phonemes": info.phonemes,
            "ipa_transcription": info.ipa_transcription,
            "syllables": info.syllables,
            "stress_pattern": info.stress_pattern,
            "difficulty_score": score,
            "difficulty_level": label,
            "challenging_phonemes": hard_sounds,
            "pronunciation_tips": get_word_pronunciation_tips(word, info.phonemes),
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing word: {str(e)}")
1314
 
1315
-
1316
@router.post("/analyze/text")
async def analyze_text_difficulty(text: str = Form(...)):
    """Report the pronunciation difficulty of a piece of text.

    Runs ``assessor._analyze_text`` on the submitted text and returns the
    word count, total phoneme count, overall difficulty with its
    easy/medium/hard label, the challenging sounds, the per-word breakdown
    and a list of recommendations.

    Raises:
        HTTPException: 500 when the analysis fails.
    """
    try:
        analysis = assessor._analyze_text(text)
        overall = analysis["difficulty_score"]

        if overall > 0.7:
            label = "hard"
        elif overall > 0.4:
            label = "medium"
        else:
            label = "easy"

        response = {
            "text": text,
            "word_count": len(analysis["words"]),
            "total_phonemes": analysis["total_phonemes"],
            "overall_difficulty": overall,
            "difficulty_level": label,
            "challenging_sounds": analysis["challenging_sounds"],
            "word_breakdown": analysis["words"],
            "recommendations": get_text_recommendations(analysis),
        }
        return response

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Text analysis error: {str(e)}")
1339
-
 
 
1340
 
1341
@router.get("/dictionary/search")
async def search_dictionary(query: str, limit: int = 20):
    """Search the CMU dictionary for words containing ``query``.

    Args:
        query: Substring to look for; lower-cased before matching against
            the dictionary keys.
        limit: Maximum number of matches to collect. A value of zero or less
            yields no matches.

    Returns:
        A dict with the original query, the number of matches, and the
        matches themselves sorted by difficulty (easiest first).

    Raises:
        HTTPException: 500 when the search fails.
    """
    try:
        processor = assessor.phoneme_processor
        query_lower = query.lower()
        matching_words = []

        for word in processor.cmu_dict:
            # Stop scanning once enough matches are collected; without this
            # the loop would walk the rest of the dictionary for nothing.
            if len(matching_words) >= limit:
                break
            if query_lower not in word:
                continue

            word_info = processor.get_word_phonemes(word)
            difficulty = processor.get_difficulty_score(word_info.phonemes)

            matching_words.append(
                {
                    "word": word,
                    "phonemes": word_info.phonemes,
                    "ipa": word_info.ipa_transcription,
                    "difficulty": difficulty,
                    "difficulty_level": (
                        "hard"
                        if difficulty > 0.7
                        else "medium" if difficulty > 0.4 else "easy"
                    ),
                }
            )

        # Sort by difficulty (easiest first)
        matching_words.sort(key=lambda x: x["difficulty"])

        return {"query": query, "found": len(matching_words), "words": matching_words}

    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Dictionary search error: {str(e)}"
        ) from e
1381
-
1382
-
1383
@router.get("/practice/level/{level}")
async def get_practice_words(level: str, count: int = 10):
    """Pick random practice words whose difficulty falls in the level's range.

    The dictionary keys are shuffled and scanned until ``count`` words are
    found. Words shorter than 3 or longer than 12 characters, and words
    containing non-alphabetic characters, are skipped.

    Raises:
        HTTPException: 400 for a level other than easy/medium/hard;
            500 when word selection fails.
    """
    ranges_by_level = {
        "easy": (0, 0.4),
        "medium": (0.4, 0.7),
        "hard": (0.7, 1.0),
    }
    if level not in ranges_by_level:
        raise HTTPException(
            status_code=400, detail="Level must be easy, medium, or hard"
        )

    try:
        processor = assessor.phoneme_processor
        difficulty_range = ranges_by_level[level]
        lower, upper = difficulty_range

        # Sample words from dictionary
        candidates = list(processor.cmu_dict.keys())
        np.random.shuffle(candidates)

        practice_words = []
        for candidate in candidates:
            if len(practice_words) >= count:
                break

            # Skip very short / very long words and words with special characters
            usable = 3 <= len(candidate) <= 12 and candidate.isalpha()
            if not usable:
                continue

            info = processor.get_word_phonemes(candidate)
            difficulty = processor.get_difficulty_score(info.phonemes)

            if lower <= difficulty <= upper:
                practice_words.append(
                    {
                        "word": candidate,
                        "phonemes": info.phonemes,
                        "ipa": info.ipa_transcription,
                        "difficulty": difficulty,
                        "tips": get_word_pronunciation_tips(candidate, info.phonemes),
                    }
                )

        return {
            "level": level,
            "difficulty_range": difficulty_range,
            "count": len(practice_words),
            "words": practice_words,
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Practice words error: {str(e)}")
1445
-
1446
-
1447
  # =============================================================================
1448
  # HELPER FUNCTIONS
1449
  # =============================================================================
1450
 
1451
-
1452
def get_phoneme_tips(phoneme: str) -> List[str]:
    """Return articulation tips for one phoneme.

    Dedicated tips exist for TH, DH, V, R, L and Z; every other phoneme
    gets a single generic practice hint.
    """
    articulation_tips = {
        "TH": [
            "Place tongue tip between upper and lower teeth",
            "Blow air gently while keeping tongue in position",
            "Should feel air flowing over tongue",
        ],
        "DH": [
            "Same tongue position as TH",
            "Add vocal cord vibration",
            "Should feel buzzing in throat",
        ],
        "V": [
            "Touch bottom lip to upper teeth",
            "Voice while air flows through the gap",
            "Don't use both lips like Vietnamese 'V'",
        ],
        "R": [
            "Curl tongue without touching roof of mouth",
            "Don't roll the R like in Vietnamese",
            "Tongue should float freely",
        ],
        "L": [
            "Touch tongue tip to roof of mouth behind teeth",
            "Let air flow around sides of tongue",
            "Make sure tongue actually touches",
        ],
        "Z": [
            "Same tongue position as 'S'",
            "Add vocal cord vibration",
            "Should buzz like a bee",
        ],
    }

    if phoneme in articulation_tips:
        return articulation_tips[phoneme]
    return ["Practice this sound slowly and clearly"]
1488
-
1489
-
1490
def get_word_pronunciation_tips(word: str, phonemes: List[str]) -> List[str]:
    """Return pronunciation tips derived from a word's phoneme sequence.

    Tips are added, in this order, for: an S+T / S+K / S+P cluster, a TH
    sound, R and L occurring together, a final stop consonant, and a long
    vowel (IY, UW, AO, stress digits ignored). When nothing applies, a
    single generic syllable-splitting tip is returned.
    """
    joined = " ".join(phonemes)
    clusters = ("S T", "S K", "S P")
    final_stops = ("T", "D", "K", "G", "P", "B")
    long_vowels = ("IY", "UW", "AO")

    tips = []

    # Consonant clusters
    if any(cluster in joined for cluster in clusters):
        tips.append("Practice the consonant cluster slowly, then speed up")

    # TH sounds
    if "TH" in phonemes:
        tips.append("Remember: tongue between teeth for TH sounds")

    # R and L distinction
    if "R" in phonemes and "L" in phonemes:
        tips.append("Focus on R (no touching) vs L (tongue touches roof)")

    # Final consonants (Vietnamese tendency to drop)
    last_sound = phonemes[-1] if phonemes else ""
    if last_sound in final_stops:
        tips.append("Don't forget the final consonant sound")

    # Vowel length
    if any(re.sub(r"[0-9]", "", sound) in long_vowels for sound in phonemes):
        tips.append("Make sure long vowels are actually longer")

    return tips or ["Break the word into syllables and practice each part"]
1525
-
1526
-
1527
def get_text_recommendations(text_analysis: Dict) -> List[str]:
    """Turn a text analysis into practice recommendations.

    Reads ``difficulty_score``, ``challenging_sounds`` and ``words`` (each
    word needs a ``phonemes`` sequence). May return an empty list.
    """
    advice = []

    score = text_analysis["difficulty_score"]
    if score < 0.3:
        advice.append(
            "This text is good for beginners. Try adding more challenging words gradually."
        )
    elif score > 0.8:
        advice.append(
            "This is very challenging text. Consider starting with easier words first."
        )

    hard_sounds = text_analysis["challenging_sounds"]
    if len(hard_sounds) > 5:
        advice.append(
            "This text has many challenging sounds. Practice individual words first."
        )

    # Word length recommendations: count words with more than 8 phonemes
    long_word_count = sum(
        1 for entry in text_analysis["words"] if len(entry["phonemes"]) > 8
    )
    if long_word_count:
        advice.append(
            "Break down longer words into syllables for easier practice."
        )

    return advice
1556
-
1557
-
1558
- # =============================================================================
1559
- # ADDITIONAL ENDPOINTS
1560
- # =============================================================================
1561
-
1562
-
1563
@router.get("/stats")
async def get_system_stats():
    """Report dictionary and phoneme-model sizes plus static capability info.

    Only the two counts are computed (from ``assessor.phoneme_processor``);
    every other value in the response is a fixed literal.
    """
    processor = assessor.phoneme_processor
    word_total = len(processor.cmu_dict)
    phoneme_total = len(processor.phoneme_models)

    capabilities = [
        "CMU Pronouncing Dictionary integration",
        "IPA transcription",
        "Syllable analysis",
        "Contextual phoneme scoring",
        "Vietnamese learner optimization",
    ]

    return {
        "total_words_supported": word_total,
        "phonemes_supported": phoneme_total,
        "difficulty_levels": ["easy", "medium", "hard"],
        "audio_formats_supported": ["wav", "mp3", "m4a", "flac"],
        "max_audio_duration": "30 seconds",
        "vietnamese_specific_features": True,
        "features": capabilities,
    }
1583
-
1584
-
1585
@router.get("/phonemes/difficult")
async def get_difficult_phonemes_for_vietnamese():
    """List the phonemes whose mapped difficulty exceeds 0.6, hardest first.

    Each entry carries the phoneme, its difficulty, its tips and some
    example words.
    """
    # NOTE(review): "/phonemes/{word}" is registered on this router before this
    # route; if routes are matched in registration order, a request for
    # "/phonemes/difficult" is served by that handler instead — confirm and
    # reorder the registrations if so.
    difficulty_map = assessor.phoneme_processor.difficulty_map

    # Only include challenging ones
    challenging = [
        {
            "phoneme": symbol,
            "difficulty": level,
            "tips": get_phoneme_tips(symbol),
            "example_words": get_example_words(symbol),
        }
        for symbol, level in difficulty_map.items()
        if level > 0.6
    ]

    # Sort by difficulty (hardest first)
    ranked = sorted(challenging, key=lambda item: item["difficulty"], reverse=True)

    return {
        "difficult_phonemes": ranked,
        "total_count": len(ranked),
        "recommendation": "Focus on the top 5 most difficult sounds first",
    }
1609
-
1610
-
1611
def get_example_words(phoneme: str) -> List[str]:
    """Return sample words that contain the given phoneme.

    Phonemes without a curated list get a single placeholder of the form
    ``word_with_<phoneme in lower case>``.
    """
    samples = {
        "TH": ["think", "three", "math", "path"],
        "DH": ["this", "that", "mother", "weather"],
        "V": ["very", "love", "give", "have"],
        "Z": ["zoo", "zero", "buzz", "rise"],
        "R": ["red", "car", "very", "right"],
        "L": ["love", "hello", "well", "people"],
        "W": ["water", "well", "what", "sweet"],
        "ZH": ["measure", "vision", "treasure"],
        "CH": ["chair", "much", "teach"],
        "JH": ["job", "bridge", "age"],
        "SH": ["shoe", "fish", "nation"],
        "NG": ["ring", "thing", "young"],
    }

    placeholder = [f"word_with_{phoneme.lower()}"]
    return samples.get(phoneme, placeholder)
 
1
# PRONUNCIATION ASSESSMENT USING WAV2VEC2 (CHARACTER-LEVEL CTC)
# Input: Audio + Reference Text. Output: Word highlights + Phoneme diff + Wrong words
# Uses a Wav2Vec2 character-level CTC model without language model correction; phonemes are derived from the character transcript via G2P
4
 
5
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from pydantic import BaseModel
8
+ from typing import List, Dict, Optional
9
  import tempfile
10
  import os
11
  import numpy as np
12
  import librosa
13
  import nltk
14
  import eng_to_ipa as ipa
15
+ import torch
 
 
16
  import re
17
  from collections import defaultdict
18
  import warnings
19
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2PhonemeCTCTokenizer
20
 
21
  warnings.filterwarnings("ignore")
22
 
23
# Download required NLTK data
try:
    nltk.download("cmudict", quiet=True)
    from nltk.corpus import cmudict
except Exception:
    # Best effort: the module still imports without the CMU dictionary.
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate instead of being reported as missing data.
    print("Warning: NLTK data not available")
 
30
  # =============================================================================
31
  # MODELS
32
  # =============================================================================
 
33
 
34
+ router = APIRouter(prefix="/pronunciation", tags=["Pronunciation"])
35
 
36
# Response schema for a pronunciation assessment.
# NOTE(review): the code that fills this model is not in this part of the file;
# the notes on un-annotated fields below are assumptions to confirm against it.
class PronunciationAssessmentResult(BaseModel):
    transcript: str  # What the user actually said (character transcript)
    transcript_phonemes: str  # User's phonemes
    user_phonemes: str  # Alias for transcript_phonemes for UI clarity
    character_transcript: str  # presumably the ASR character transcript — TODO confirm how it differs from `transcript`
    overall_score: float  # value range not visible here — TODO confirm
    word_highlights: List[Dict]  # presumably one dict per reference word — TODO confirm keys
    phoneme_differences: List[Dict]  # presumably reference-vs-user phoneme diffs — TODO confirm keys
    wrong_words: List[Dict]  # presumably the words judged mispronounced — TODO confirm keys
    feedback: List[str]  # feedback messages
    processing_info: Dict  # schema not visible here — TODO confirm
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # =============================================================================
49
+ # WAV2VEC2 PHONEME ASR
50
  # =============================================================================
51
 
52
class Wav2Vec2CharacterASR:
    """Wav2Vec2 character-level ASR without language model correction.

    Audio is decoded by taking the arg-max of the CTC logits, so the
    transcript is the model's raw character output. A phoneme string is then
    derived from that transcript with ``SimpleG2P``.
    """

    # Crude one-letter-to-one-sound table, used only when G2P yields nothing.
    _LETTER_TO_PHONEME = {
        'a': 'æ', 'b': 'b', 'c': 'k', 'd': 'd', 'e': 'ɛ',
        'f': 'f', 'g': 'ɡ', 'h': 'h', 'i': 'ɪ', 'j': 'dʒ',
        'k': 'k', 'l': 'l', 'm': 'm', 'n': 'n', 'o': 'ʌ',
        'p': 'p', 'q': 'k', 'r': 'r', 's': 's', 't': 't',
        'u': 'ʌ', 'v': 'v', 'w': 'w', 'x': 'ks', 'y': 'j', 'z': 'z'
    }

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """
        Initialize Wav2Vec2 character-level model
        Available models:
        - facebook/wav2vec2-large-960h-lv60-self (character-level, no LM)
        - facebook/wav2vec2-base-960h (character-level, no LM)
        - facebook/wav2vec2-large-960h (character-level, no LM)

        If ``model_name`` cannot be loaded, the base model is tried instead.

        Raises:
            Exception: when neither the requested nor the fallback model loads.
        """
        print(f"Loading Wav2Vec2 character model: {model_name}")

        try:
            self._load_model(model_name)
            print("Wav2Vec2 character model loaded successfully")
        except Exception as e:
            print(f"Error loading model {model_name}: {e}")
            # Fallback to base model
            fallback_model = "facebook/wav2vec2-base-960h"
            print(f"Trying fallback model: {fallback_model}")
            try:
                self._load_model(fallback_model)
                print("Fallback model loaded successfully")
            except Exception as e2:
                # Chain the fallback error so its traceback is kept.
                raise Exception(
                    f"Failed to load both models. Original error: {e}, Fallback error: {e2}"
                ) from e2

        self.sample_rate = 16000
        # G2P converter, created on first use and then reused across calls.
        self._g2p = None

    def _load_model(self, name: str) -> None:
        """Load processor and model for ``name`` and put the model in eval mode."""
        self.processor = Wav2Vec2Processor.from_pretrained(name)
        self.model = Wav2Vec2ForCTC.from_pretrained(name)
        self.model.eval()
        self.model_name = name

    def _get_g2p(self):
        """Return the shared ``SimpleG2P``, creating it on first use.

        ``SimpleG2P()`` loads the whole CMU dictionary, so building a new one
        for every transcription is wasted work.
        """
        g2p = getattr(self, "_g2p", None)
        if g2p is None:
            g2p = SimpleG2P()
            self._g2p = g2p
        return g2p

    def transcribe_to_characters(self, audio_path: str) -> Dict:
        """
        Transcribe audio directly to characters (no language model correction)
        Returns raw character sequence as produced by the model

        Returns:
            A dict with ``character_transcript``, ``phoneme_representation``,
            ``raw_predicted_ids`` and ``confidence_scores`` (the per-frame
            maximum softmax probability, first 100 frames only). On any
            failure the error is printed and the same keys are returned with
            empty values.
        """
        try:
            # Load audio
            speech, _ = librosa.load(audio_path, sr=self.sample_rate)

            # Prepare input
            input_values = self.processor(
                speech,
                sampling_rate=self.sample_rate,
                return_tensors="pt"
            ).input_values

            # Get model predictions (no language model involved)
            with torch.no_grad():
                logits = self.model(input_values).logits
                predicted_ids = torch.argmax(logits, dim=-1)

            # Decode to characters directly, then clean up
            character_transcript = self._clean_character_transcript(
                self.processor.batch_decode(predicted_ids)[0]
            )

            # Convert characters to phoneme-like representation
            phoneme_like_transcript = self._characters_to_phoneme_representation(
                character_transcript
            )

            frame_confidence = torch.softmax(logits, dim=-1).max(dim=-1)[0][0]

            return {
                "character_transcript": character_transcript,
                "phoneme_representation": phoneme_like_transcript,
                "raw_predicted_ids": predicted_ids[0].tolist(),
                "confidence_scores": frame_confidence.tolist()[:100]  # Limit for JSON
            }

        except Exception as e:
            # Best effort by design: callers get an empty result, not an error.
            print(f"Transcription error: {e}")
            return {
                "character_transcript": "",
                "phoneme_representation": "",
                "raw_predicted_ids": [],
                "confidence_scores": []
            }

    def _clean_character_transcript(self, transcript: str) -> str:
        """Collapse whitespace runs, trim the ends and lower-case the transcript."""
        cleaned = re.sub(r'\s+', ' ', transcript)
        return cleaned.strip().lower()

    def _characters_to_phoneme_representation(self, text: str) -> str:
        """Convert a character transcript to a space-separated phoneme string.

        Each whitespace-separated token is passed through ``SimpleG2P``. If
        G2P raises or returns nothing for a token, the token is converted
        letter by letter instead. Empty text yields an empty string.
        """
        if not text:
            return ""

        g2p = self._get_g2p()
        phonemes = []

        for word in text.split():
            try:
                entries = g2p.text_to_phonemes(word)
            except Exception:
                # `Exception`, not a bare except: interrupts must propagate.
                entries = []

            if entries:
                # Use every entry G2P produced for the token, so that a token
                # it splits in two does not silently lose its second half.
                for entry in entries:
                    phonemes.extend(entry["phonemes"])
            else:
                # Fallback: simple letter-to-sound mapping
                phonemes.extend(self._simple_letter_to_phoneme(word))

        return " ".join(phonemes)

    def _simple_letter_to_phoneme(self, word: str) -> List[str]:
        """Map each letter to one sound; characters outside a-z are dropped."""
        table = self._LETTER_TO_PHONEME
        return [table[letter] for letter in word.lower() if letter in table]
181
 
182
+ # =============================================================================
183
+ # SIMPLE G2P FOR REFERENCE
184
+ # =============================================================================
185
 
186
class SimpleG2P:
    """Simple Grapheme-to-Phoneme converter for reference text.

    Words found in the CMU dictionary use its first listed pronunciation,
    with stress digits removed and symbols mapped to IPA. Unknown words go
    through a rough spelling-based estimate.
    """

    # Mapping from CMU to Wav2Vec2/eSpeak phonemes
    _CMU_TO_ESPEAK = {
        "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ",
        "AY": "aɪ", "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ",
        "IY": "i", "OW": "oʊ", "OY": "ɔɪ", "UH": "ʊ", "UW": "u",
        "B": "b", "CH": "tʃ", "D": "d", "DH": "ð", "F": "f",
        "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k", "L": "l",
        "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
        "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v",
        "W": "w", "Y": "j", "Z": "z", "ZH": "ʒ"
    }

    # Spelling-to-sound table for words missing from the dictionary.
    # "ch" maps to "tʃ", matching the CMU "CH" entry above.
    _SPELLING_TO_PHONEMES = {
        "ch": ("tʃ",), "sh": ("ʃ",), "th": ("θ",), "ph": ("f",),
        "ck": ("k",), "ng": ("ŋ",), "qu": ("k", "w"),
        "a": ("æ",), "e": ("ɛ",), "i": ("ɪ",), "o": ("ʌ",), "u": ("ʌ",),
        "b": ("b",), "c": ("k",), "d": ("d",), "f": ("f",), "g": ("ɡ",),
        "h": ("h",), "j": ("dʒ",), "k": ("k",), "l": ("l",), "m": ("m",),
        "n": ("n",), "p": ("p",), "r": ("r",), "s": ("s",), "t": ("t",),
        "v": ("v",), "w": ("w",), "x": ("k", "s"), "y": ("j",), "z": ("z",)
    }

    def __init__(self):
        """Load the CMU dictionary, or use an empty one when it is unavailable."""
        try:
            self.cmu_dict = cmudict.dict()
        except Exception:
            # Covers both a missing corpus and a failed `cmudict` import
            # (NameError); interrupts are deliberately not swallowed.
            self.cmu_dict = {}
            print("Warning: CMU dictionary not available")

    def text_to_phonemes(self, text: str) -> List[Dict]:
        """Convert text to one dict per word.

        Each dict has ``word``, ``phonemes`` (list), ``ipa`` and
        ``phoneme_string`` (the phonemes joined by spaces).
        """
        sequence = []
        for word in self._clean_text(text).split():
            word_phonemes = self._get_word_phonemes(word)
            sequence.append({
                "word": word,
                "phonemes": word_phonemes,
                "ipa": self._get_ipa(word),
                "phoneme_string": " ".join(word_phonemes)
            })
        return sequence

    def get_reference_phoneme_string(self, text: str) -> str:
        """Return every phoneme of ``text`` as one space-separated string."""
        return " ".join(
            phoneme
            for word_data in self.text_to_phonemes(text)
            for phoneme in word_data["phonemes"]
        )

    def _clean_text(self, text: str) -> str:
        """Replace punctuation (except apostrophes) with spaces, collapse
        whitespace, lower-case and trim."""
        text = re.sub(r"[^\w\s\']", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.lower().strip()

    def _get_word_phonemes(self, word: str) -> List[str]:
        """Return phonemes from the CMU dictionary, or an estimate if absent."""
        pronunciations = self.cmu_dict.get(word.lower())
        if not pronunciations:
            return self._estimate_phonemes(word)

        # First listed pronunciation, with stress markers removed
        clean_phonemes = [re.sub(r"[0-9]", "", p) for p in pronunciations[0]]
        return self._convert_to_wav2vec_format(clean_phonemes)

    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
        """Map CMU symbols to IPA; unknown symbols are passed on lower-cased."""
        table = self._CMU_TO_ESPEAK
        return [table.get(phoneme, phoneme.lower()) for phoneme in cmu_phonemes]

    def _get_ipa(self, word: str) -> str:
        """Return the IPA transcription, or ``/word/`` when conversion fails."""
        try:
            return ipa.convert(word)
        except Exception:
            return f"/{word}/"

    def _estimate_phonemes(self, word: str) -> List[str]:
        """Estimate phonemes for unknown words from their spelling.

        Scans left to right, preferring two-letter spellings (ch, sh, th,
        ph, ck, ng, qu) over single letters; characters with no mapping are
        skipped.
        """
        table = self._SPELLING_TO_PHONEMES
        word = word.lower()
        phonemes = []
        i = 0

        while i < len(word):
            # Check 2-letter combinations first
            two_char = word[i:i + 2]
            if len(two_char) == 2 and two_char in table:
                phonemes.extend(table[two_char])
                i += 2
                continue

            # Single character
            phonemes.extend(table.get(word[i], ()))
            i += 1

        return phonemes
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  # =============================================================================
304
+ # PHONEME COMPARATOR
305
  # =============================================================================
306
 
307
class PhonemeComparator:
    """Compare reference and learner phoneme sequences.

    The comparison is purely positional: token ``i`` of the reference is
    compared with token ``i`` of the learner sequence. No alignment is
    performed, so one inserted or dropped phoneme shifts every later
    comparison.
    """

    def __init__(self):
        # Vietnamese speakers' common phoneme substitutions:
        # reference phoneme -> learner phonemes treated as "acceptable".
        self.substitution_patterns = {
            "θ": ["f", "s", "t"],  # TH → F, S, T
            "ð": ["d", "z", "v"],  # DH → D, Z, V
            "v": ["w", "f"],       # V → W, F
            "r": ["l"],            # R → L
            "l": ["r"],            # L → R
            "z": ["s"],            # Z → S
            "ʒ": ["ʃ", "z"],       # ZH → SH, Z
            "ŋ": ["n"],            # NG → N
        }

        # Difficulty levels for Vietnamese speakers (higher = harder).
        # Phonemes not listed here fall back to 0.3 at lookup time.
        self.difficulty_map = {
            "θ": 0.9,   # th (think)
            "ð": 0.9,   # th (this)
            "v": 0.8,   # v
            "z": 0.8,   # z
            "ʒ": 0.9,   # zh (measure)
            "r": 0.7,   # r
            "l": 0.6,   # l
            "w": 0.5,   # w
            "f": 0.4,   # f
            "s": 0.3,   # s
            "ʃ": 0.5,   # sh
            "tʃ": 0.4,  # ch
            "dʒ": 0.5,  # j
            "ŋ": 0.3,   # ng
        }

    def compare_phoneme_sequences(self, reference_phonemes: str,
                                  learner_phonemes: str) -> List[Dict]:
        """Compare two space-separated phoneme strings position by position.

        Args:
            reference_phonemes: Expected phonemes, separated by whitespace.
            learner_phonemes: Phonemes produced by the learner, separated by
                whitespace.

        Returns:
            One dict per position up to the longer sequence, with keys
            ``position``, ``reference_phoneme``, ``learner_phoneme`` (``""``
            on the side that ran out), ``status`` (``correct``, ``acceptable``,
            ``wrong``, ``missing`` or ``extra``), ``score`` and ``difficulty``.
        """
        ref_phones = reference_phonemes.split()
        learner_phones = learner_phonemes.split()

        print(f"Reference phonemes: {ref_phones}")
        print(f"Learner phonemes: {learner_phones}")

        comparisons = []
        max_len = max(len(ref_phones), len(learner_phones))

        for i in range(max_len):
            # split() never yields empty tokens, so "" only ever marks the
            # shorter sequence having run out, and both sides cannot be ""
            # at once (the former "both empty" branch was unreachable).
            ref_phoneme = ref_phones[i] if i < len(ref_phones) else ""
            learner_phoneme = learner_phones[i] if i < len(learner_phones) else ""

            if not learner_phoneme:
                status, score = "missing", 0.0
            elif not ref_phoneme:
                status, score = "extra", 0.0
            elif ref_phoneme == learner_phoneme:
                status, score = "correct", 1.0
            elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
                status, score = "acceptable", 0.7
            else:
                status, score = "wrong", 0.2

            comparisons.append({
                "position": i,
                "reference_phoneme": ref_phoneme,
                "learner_phoneme": learner_phoneme,
                "status": status,
                "score": score,
                "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
            })

        return comparisons

    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
        """Return True if ``learner`` is a listed substitution for ``reference``."""
        return learner in self.substitution_patterns.get(reference, [])
401
 
402
+ # =============================================================================
403
+ # WORD ANALYZER
404
+ # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
+ class WordAnalyzer:
407
+ """Analyze word-level pronunciation accuracy using character-based ASR"""
408
+
409
+ def __init__(self):
410
+ self.g2p = SimpleG2P()
411
+ self.comparator = PhonemeComparator()
412
+
413
+ def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
414
+ """Analyze word-level pronunciation using phoneme representation from character ASR"""
415
+
416
+ # Get reference phonemes by word
417
+ reference_words = self.g2p.text_to_phonemes(reference_text)
418
+
419
+ # Get overall phoneme comparison
420
+ reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
421
+ phoneme_comparisons = self.comparator.compare_phoneme_sequences(
422
+ reference_phoneme_string, learner_phonemes
423
  )
424
+
425
+ # Map phonemes back to words
426
+ word_highlights = self._create_word_highlights(reference_words, phoneme_comparisons)
427
+
428
+ # Identify wrong words
429
+ wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
430
+
431
  return {
432
+ "word_highlights": word_highlights,
433
+ "phoneme_differences": phoneme_comparisons,
434
+ "wrong_words": wrong_words
 
 
 
435
  }
436
+
437
+ def _create_word_highlights(self, reference_words: List[Dict],
438
+ phoneme_comparisons: List[Dict]) -> List[Dict]:
439
+ """Create word highlighting data"""
440
+
441
+ word_highlights = []
442
+ phoneme_index = 0
443
+
444
+ for word_data in reference_words:
445
+ word = word_data["word"]
446
+ word_phonemes = word_data["phonemes"]
447
+ num_phonemes = len(word_phonemes)
448
+
449
+ # Get phoneme scores for this word
450
+ word_phoneme_scores = []
451
+ for j in range(num_phonemes):
452
+ if phoneme_index + j < len(phoneme_comparisons):
453
+ comparison = phoneme_comparisons[phoneme_index + j]
454
+ word_phoneme_scores.append(comparison["score"])
455
+
456
+ # Calculate word score
457
+ word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0
458
+
459
+ # Create word highlight
460
+ highlight = {
461
+ "word": word,
462
+ "score": float(word_score),
463
+ "status": self._get_word_status(word_score),
464
+ "color": self._get_word_color(word_score),
465
+ "phonemes": word_phonemes,
466
+ "ipa": word_data["ipa"],
467
+ "phoneme_scores": word_phoneme_scores,
468
+ "phoneme_start_index": phoneme_index,
469
+ "phoneme_end_index": phoneme_index + num_phonemes - 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  }
471
+
472
+ word_highlights.append(highlight)
473
+ phoneme_index += num_phonemes
474
+
475
+ return word_highlights
476
+
477
+ def _identify_wrong_words(self, word_highlights: List[Dict],
478
+ phoneme_comparisons: List[Dict]) -> List[Dict]:
479
+ """Identify words that were pronounced incorrectly"""
480
+
481
+ wrong_words = []
482
+
483
+ for word_highlight in word_highlights:
484
+ if word_highlight["score"] < 0.6: # Threshold for wrong pronunciation
485
+
486
+ # Find specific phoneme errors for this word
487
+ start_idx = word_highlight["phoneme_start_index"]
488
+ end_idx = word_highlight["phoneme_end_index"]
489
+
490
+ wrong_phonemes = []
491
+ missing_phonemes = []
492
+
493
+ for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
494
+ comparison = phoneme_comparisons[i]
495
+
496
+ if comparison["status"] == "wrong":
497
+ wrong_phonemes.append({
498
+ "expected": comparison["reference_phoneme"],
499
+ "actual": comparison["learner_phoneme"],
500
+ "difficulty": comparison["difficulty"]
501
+ })
502
+ elif comparison["status"] == "missing":
503
+ missing_phonemes.append({
504
+ "phoneme": comparison["reference_phoneme"],
505
+ "difficulty": comparison["difficulty"]
506
+ })
507
+
508
+ wrong_word = {
509
+ "word": word_highlight["word"],
510
+ "score": word_highlight["score"],
511
+ "expected_phonemes": word_highlight["phonemes"],
512
+ "ipa": word_highlight["ipa"],
513
+ "wrong_phonemes": wrong_phonemes,
514
+ "missing_phonemes": missing_phonemes,
515
+ "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes)
516
  }
517
+
518
+ wrong_words.append(wrong_word)
519
+
520
+ return wrong_words
521
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  def _get_word_status(self, score: float) -> str:
523
  """Get word status from score"""
524
  if score >= 0.8:
 
529
  return "needs_practice"
530
  else:
531
  return "poor"
532
+
533
+ def _get_word_color(self, score: float) -> str:
534
+ """Get color for word highlighting"""
535
+ if score >= 0.8:
536
+ return "#22c55e" # Green
537
+ elif score >= 0.6:
538
+ return "#84cc16" # Light green
539
+ elif score >= 0.4:
540
+ return "#eab308" # Yellow
541
+ else:
542
+ return "#ef4444" # Red
543
+
544
+ def _get_vietnamese_tips(self, wrong_phonemes: List[Dict],
545
+ missing_phonemes: List[Dict]) -> List[str]:
546
+ """Get Vietnamese-specific pronunciation tips"""
547
+
548
+ tips = []
549
+
550
+ # Tips for specific Vietnamese pronunciation challenges
551
+ vietnamese_tips = {
552
+ "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
553
+ "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
554
+ "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
555
+ "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
556
+ "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
557
+ "z": "Giống âm 's' nhưng có rung dây thanh âm",
558
+ "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
559
+ "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'"
560
+ }
561
+
562
+ # Add tips for wrong phonemes
563
+ for wrong in wrong_phonemes:
564
+ expected = wrong["expected"]
565
+ actual = wrong["actual"]
566
+
567
+ if expected in vietnamese_tips:
568
+ tips.append(f"Âm '{expected}': {vietnamese_tips[expected]}")
569
+ else:
570
+ tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")
571
+
572
+ # Add tips for missing phonemes
573
+ for missing in missing_phonemes:
574
+ phoneme = missing["phoneme"]
575
+ if phoneme in vietnamese_tips:
576
+ tips.append(f"Thiếu âm '{phoneme}': {vietnamese_tips[phoneme]}")
577
+
578
+ return tips
579
 
580
+ # =============================================================================
581
+ # FEEDBACK GENERATOR
582
+ # =============================================================================
583
 
584
class SimpleFeedbackGenerator:
    """Produce short, actionable feedback messages in Vietnamese."""

    def generate_feedback(self, overall_score: float, wrong_words: List[Dict],
                          phoneme_comparisons: List[Dict]) -> List[str]:
        """Assemble the feedback lines for one assessment.

        The list always starts with a message chosen by ``overall_score``. A
        line about the words to practise follows when ``wrong_words`` is not
        empty, and finally a hint for the reference phoneme most often marked
        ``wrong`` or ``missing``, if a hint exists for it.
        """
        # Overall verdict, by score band.
        if overall_score >= 0.8:
            headline = "Phát âm rất tốt! Bạn đã làm xuất sắc."
        elif overall_score >= 0.6:
            headline = "Phát âm khá tốt, còn một vài điểm cần cải thiện."
        elif overall_score >= 0.4:
            headline = "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
        else:
            headline = "Hãy luyện tập chậm và rõ ràng hơn."
        messages = [headline]

        # Name the words when there are few, otherwise just count them.
        if wrong_words:
            if len(wrong_words) > 3:
                messages.append(
                    f"Có {len(wrong_words)} từ cần luyện tập. Tập trung vào từng từ một."
                )
            else:
                listed = ", ".join(item["word"] for item in wrong_words)
                messages.append(f"Các từ cần luyện tập: {listed}")

        # Count how often each reference phoneme was wrong or missing.
        error_counts = {}
        for item in phoneme_comparisons:
            if item["status"] in ["wrong", "missing"]:
                key = item["reference_phoneme"]
                error_counts[key] = error_counts.get(key, 0) + 1

        if error_counts:
            # max() keeps the first-seen phoneme on ties.
            top_problem = max(error_counts.items(), key=lambda pair: pair[1])[0]

            phoneme_tips = {
                "θ": "Lưỡi giữa răng, thổi nhẹ",
                "ð": "Lưỡi giữa răng, rung dây thanh",
                "v": "Môi dưới chạm răng trên",
                "r": "Cuộn lưỡi, không chạm vòm miệng",
                "l": "Lưỡi chạm vòm miệng",
                "z": "Như 's' nhưng rung dây thanh",
            }

            if top_problem in phoneme_tips:
                messages.append(
                    f"Âm khó nhất '{top_problem}': {phoneme_tips[top_problem]}"
                )

        return messages
635
 
636
  # =============================================================================
637
+ # MAIN PRONUNCIATION ASSESSOR
638
  # =============================================================================
639
 
640
class SimplePronunciationAssessor:
    """Top-level assessor tying together ASR, word analysis and feedback."""

    def __init__(self):
        """Instantiate the ASR wrapper, the word analyzer and the feedback generator."""
        print("Initializing Simple Pronunciation Assessor...")
        self.asr = Wav2Vec2CharacterASR()  # character-based ASR
        self.word_analyzer = WordAnalyzer()
        self.feedback_generator = SimpleFeedbackGenerator()
        print("Initialization completed")

    def assess_pronunciation(self, audio_path: str, reference_text: str) -> Dict:
        """Assess the recording at ``audio_path`` against ``reference_text``.

        Returns:
            Dict with the character transcript, its phoneme representation
            (under two keys), the overall score, word highlights, phoneme
            differences, wrong words, feedback lines and ``processing_info``.
        """
        print("Starting pronunciation assessment...")

        # 1) Character-level transcription of the audio.
        print("Step 1: Transcribing to characters...")
        transcription = self.asr.transcribe_to_characters(audio_path)
        spoken_text = transcription["character_transcript"]
        spoken_phonemes = transcription["phoneme_representation"]

        print(f"Character transcript: {spoken_text}")
        print(f"Phoneme representation: {spoken_phonemes}")

        # 2) Word-level analysis against the reference text.
        print("Step 2: Analyzing words...")
        analysis = self.word_analyzer.analyze_words(reference_text, spoken_phonemes)

        # 3) Overall score from the per-phoneme comparisons.
        differences = analysis["phoneme_differences"]
        score = self._calculate_overall_score(differences)

        # 4) Learner-facing feedback.
        print("Step 3: Generating feedback...")
        messages = self.feedback_generator.generate_feedback(
            score, analysis["wrong_words"], differences
        )

        processing_info = {
            "model_used": f"Wav2Vec2-Character ({self.asr.model_name})",
            "character_based": True,
            "language_model_correction": False,
            "raw_output": True,
        }
        report = {
            "transcript": spoken_text,  # what the user actually said
            "transcript_phonemes": spoken_phonemes,
            "user_phonemes": spoken_phonemes,  # alias for UI clarity
            "character_transcript": spoken_text,
            "overall_score": score,
            "word_highlights": analysis["word_highlights"],
            "phoneme_differences": differences,
            "wrong_words": analysis["wrong_words"],
            "feedback": messages,
            "processing_info": processing_info,
        }

        print("Assessment completed successfully")
        return report

    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
        """Return the mean of the ``score`` entries, or 0.0 for an empty list."""
        if not phoneme_comparisons:
            return 0.0

        scores = [entry["score"] for entry in phoneme_comparisons]
        return sum(scores) / len(scores)
710
 
711
  # =============================================================================
712
+ # API ENDPOINT
713
  # =============================================================================
714
 
715
+ # Initialize assessor
716
+ assessor = SimplePronunciationAssessor()
717
+
718
def convert_numpy_types(obj):
    """Recursively convert numpy values to native Python types.

    Handles numpy booleans, integers, floats and arrays, and walks dicts,
    lists and tuples (a tuple stays a tuple). Any other object is returned
    unchanged.

    Args:
        obj: Arbitrary, possibly nested, value.

    Returns:
        The same structure with the numpy values above replaced by
        ``bool`` / ``int`` / ``float`` / ``list``.
    """
    # np.bool_ is not a subclass of np.integer and is not JSON-serialisable,
    # so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        # Previously tuples were returned untouched, leaving numpy scalars inside.
        return tuple(convert_numpy_types(item) for item in obj)
    return obj
732
+
733
@router.post("/assess", response_model=PronunciationAssessmentResult)
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
    reference_text: str = Form(..., description="Reference text to pronounce"),
):
    """
    Pronunciation Assessment API using Wav2Vec2 Character-level Model

    Key Features:
    - Uses facebook/wav2vec2-large-960h-lv60-self for character transcription
    - NO language model correction (shows actual pronunciation errors)
    - Character-level accuracy converted to phoneme representation
    - Vietnamese-optimized feedback and tips

    Input: Audio file + Reference text
    Output: Word highlights + Phoneme differences + Wrong words
    """
    # Raises HTTPException(400) for empty, over-long or non-English reference
    # text, and HTTPException(500) when saving or assessing the audio fails.

    import time

    start_time = time.time()

    # Validate inputs
    if not reference_text.strip():
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")

    if len(reference_text) > 500:
        raise HTTPException(
            status_code=400,
            detail="Reference text too long (max 500 characters)",
        )

    # Check for valid English characters
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        raise HTTPException(
            status_code=400,
            detail="Text must contain only English letters, spaces, and basic punctuation",
        )

    tmp_path = None
    try:
        # The upload's extension becomes the temp-file suffix. The file name is
        # client-supplied, so only a purely alphanumeric extension is accepted;
        # anything else falls back to ".wav".
        file_extension = ".wav"
        if audio.filename and "." in audio.filename:
            candidate = audio.filename.split(".")[-1]
            if candidate.isalnum():
                file_extension = f".{candidate}"

        # Save the upload; the file is closed before it is handed on.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
            tmp_path = tmp_file.name
            content = await audio.read()
            tmp_file.write(content)

        print(f"Processing audio file: {tmp_path}")

        # Run assessment using Wav2Vec2 Character model
        result = assessor.assess_pronunciation(tmp_path, reference_text)

        # Add processing time
        processing_time = time.time() - start_time
        result["processing_info"]["processing_time"] = processing_time

        # Convert numpy types for JSON serialization
        final_result = convert_numpy_types(result)

        print(f"Assessment completed in {processing_time:.2f} seconds")

        return PronunciationAssessmentResult(**final_result)

    except Exception as e:
        print(f"Assessment error: {str(e)}")
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500, detail=f"Assessment failed: {str(e)}"
        ) from e

    finally:
        # The temp file is created with delete=False, so remove it explicitly;
        # otherwise every request leaves a file behind.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup: never mask the real result or error.
                pass
801
 
802
+ # =============================================================================
803
+ # UTILITY ENDPOINTS
804
+ # =============================================================================
805
 
806
@router.get("/phonemes/{word}")
async def get_word_phonemes(word: str):
    """Get phoneme breakdown for a specific word"""
    # Raises HTTPException(400) when the input yields no phoneme entry, and
    # HTTPException(500) for any other failure during the analysis.
    try:
        g2p = SimpleG2P()
        phoneme_sequence = g2p.text_to_phonemes(word)
        if not phoneme_sequence:
            # Indexing [0] below would raise IndexError and surface as a 500.
            raise HTTPException(
                status_code=400, detail="No pronounceable word found in input"
            )
        # Only the first entry is reported.
        phoneme_data = phoneme_sequence[0]

        # Add difficulty analysis for Vietnamese speakers; phonemes without an
        # entry in the difficulty map count as 0.3.
        comparator = PhonemeComparator()
        difficulty_scores = [
            comparator.difficulty_map.get(phoneme, 0.3)
            for phoneme in phoneme_data["phonemes"]
        ]
        avg_difficulty = float(np.mean(difficulty_scores)) if difficulty_scores else 0.3

        return {
            "word": word,
            "phonemes": phoneme_data["phonemes"],
            "phoneme_string": phoneme_data["phoneme_string"],
            "ipa": phoneme_data["ipa"],
            "difficulty_score": avg_difficulty,
            "difficulty_level": "hard" if avg_difficulty > 0.6 else "medium" if avg_difficulty > 0.4 else "easy",
            # Only phonemes with difficulty above 0.6 are listed.
            "challenging_phonemes": [
                {
                    "phoneme": p,
                    "difficulty": comparator.difficulty_map.get(p, 0.3),
                    "vietnamese_tip": get_vietnamese_tip(p),
                }
                for p in phoneme_data["phonemes"]
                if comparator.difficulty_map.get(p, 0.3) > 0.6
            ],
        }

    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) intact.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Word analysis error: {str(e)}"
        ) from e
843
 
844
@router.get("/health")
async def health_check():
    """Report service status together with the name of the ASR model in use.

    If reading the model name raises, an error payload is returned instead of
    propagating the exception.
    """
    try:
        return {
            "status": "healthy",
            "model": assessor.asr.model_name,
            "character_based": True,
            "language_model_correction": False,
            "vietnamese_optimized": True,
        }
    except Exception as exc:
        return {"status": "error", "error": str(exc)}
861
 
862
@router.get("/test-model")
async def test_model():
    """Return basic information about the loaded Wav2Vec2 model.

    No inference is run: the model name and sample rate are read from the
    module-level assessor and the sample strings are fixed values. If reading
    those attributes raises, a ``model_loaded: False`` payload is returned.
    """
    try:
        asr = assessor.asr
        return {
            "model_loaded": True,
            "model_name": asr.model_name,
            "processor_ready": True,
            "sample_rate": asr.sample_rate,
            "sample_characters": "this is a test",
            "sample_phonemes": "ðɪs ɪz ə tɛst",
        }
    except Exception as exc:
        return {"model_loaded": False, "error": str(exc)}
881
 
 
 
 
 
882
  # =============================================================================
883
  # HELPER FUNCTIONS
884
  # =============================================================================
885
 
886
def get_vietnamese_tip(phoneme: str) -> str:
    """Return a short Vietnamese articulation hint for ``phoneme``.

    Phonemes without a dedicated hint get a generic "practise this sound"
    message that embeds the phoneme.
    """
    hint_by_phoneme = {
        "θ": "Đặt lưỡi giữa răng, thổi nhẹ",
        "ð": "Giống θ nhưng rung dây thanh âm",
        "v": "Môi dưới chạm răng trên",
        "r": "Cuộn lưỡi, không chạm vòm miệng",
        "l": "Lưỡi chạm vòm miệng sau răng",
        "z": "Như 's' nhưng rung dây thanh",
        "ʒ": "Như 'ʃ' nhưng rung dây thanh",
        "w": "Tròn môi như 'u'",
    }
    try:
        return hint_by_phoneme[phoneme]
    except KeyError:
        return f"Luyện âm {phoneme}"