ABAO77 committed on
Commit
64c08d9
·
1 Parent(s): 6020910

feat: add speaking route for pronunciation assessment API

Browse files
requirements.txt CHANGED
@@ -13,4 +13,7 @@ langchain-google-genai
13
  python-dotenv
14
  loguru
15
  python-multipart
16
- deepgram-sdk
 
 
 
 
13
  python-dotenv
14
  loguru
15
  python-multipart
16
+ deepgram-sdk
17
+ whisper-openai
18
+ librosa
19
+ eng-to-ipa
src/apis/create_app.py CHANGED
@@ -4,12 +4,14 @@ from src.apis.routes.user_route import router as router_user
4
  from src.apis.routes.chat_route import router as router_chat
5
  from src.apis.routes.lesson_route import router as router_lesson
6
  from src.apis.routes.evaluation_route import router as router_evaluation
 
7
 
8
  api_router = APIRouter(prefix="/api")
9
  api_router.include_router(router_user)
10
  api_router.include_router(router_chat)
11
  api_router.include_router(router_lesson)
12
  api_router.include_router(router_evaluation)
 
13
 
14
 
15
  def create_app():
 
4
  from src.apis.routes.chat_route import router as router_chat
5
  from src.apis.routes.lesson_route import router as router_lesson
6
  from src.apis.routes.evaluation_route import router as router_evaluation
7
+ from src.apis.routes.speaking_route import router as router_speaking
8
 
9
  api_router = APIRouter(prefix="/api")
10
  api_router.include_router(router_user)
11
  api_router.include_router(router_chat)
12
  api_router.include_router(router_lesson)
13
  api_router.include_router(router_evaluation)
14
+ api_router.include_router(router_speaking)
15
 
16
 
17
  def create_app():
src/apis/routes/speaking_route.py ADDED
@@ -0,0 +1,714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SIMPLIFIED PRONUNCIATION ASSESSMENT API
2
+ # Input: Audio + Reference Text → Output: Word highlights + Phoneme diff + Wrong words
3
+
4
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, APIRouter
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from pydantic import BaseModel
7
+ from typing import List, Dict, Optional
8
+ import tempfile
9
+ import os
10
+ import numpy as np
11
+ import nltk
12
+ import eng_to_ipa as ipa
13
+ import whisper
14
+ import re
15
+ from collections import defaultdict
16
+ import warnings
17
+
18
warnings.filterwarnings("ignore")

# Download the CMU pronouncing dictionary used for grapheme-to-phoneme lookup.
# SimpleG2P degrades gracefully (empty dict) if this fails, so a download
# error is only warned about, not fatal.
try:
    nltk.download("cmudict", quiet=True)
    from nltk.corpus import cmudict
except Exception:  # was a bare `except:` — would also swallow SystemExit/KeyboardInterrupt
    print("Warning: NLTK data not available")

# =============================================================================
# MODELS
# =============================================================================

router = APIRouter(prefix="/pronunciation", tags=["Pronunciation"])
32
+
33
+
34
class PronunciationAssessmentResult(BaseModel):
    """Response schema for POST /pronunciation/assess."""

    # Whisper transcript of the learner's audio
    transcript: str
    # Mean per-phoneme score in [0, 1]
    overall_score: float
    # One entry per reference word: score/status/color/phonemes/ipa/issues
    word_highlights: List[Dict]
    # Position-by-position phoneme comparison records
    phoneme_differences: List[Dict]
    # Words scoring below the 0.6 threshold, with issues and practice tips
    wrong_words: List[Dict]
    # Human-readable feedback strings (Vietnamese)
    feedback: List[str]
41
+
42
+
43
+ # =============================================================================
44
+ # CORE COMPONENTS
45
+ # =============================================================================
46
+
47
+
48
+ class SimpleG2P:
49
+ """Simple Grapheme-to-Phoneme converter"""
50
+
51
+ def __init__(self):
52
+ try:
53
+ self.cmu_dict = cmudict.dict()
54
+ except:
55
+ self.cmu_dict = {}
56
+ print("Warning: CMU dictionary not available")
57
+
58
+ def text_to_phonemes(self, text: str) -> List[Dict]:
59
+ """Convert text to phoneme sequence"""
60
+ words = self._clean_text(text).split()
61
+ phoneme_sequence = []
62
+
63
+ for word in words:
64
+ word_phonemes = self._get_word_phonemes(word)
65
+ phoneme_sequence.append(
66
+ {"word": word, "phonemes": word_phonemes, "ipa": self._get_ipa(word)}
67
+ )
68
+
69
+ return phoneme_sequence
70
+
71
+ def _clean_text(self, text: str) -> str:
72
+ """Clean text for processing"""
73
+ text = re.sub(r"[^\w\s\']", " ", text)
74
+ text = re.sub(r"\s+", " ", text)
75
+ return text.lower().strip()
76
+
77
+ def _get_word_phonemes(self, word: str) -> List[str]:
78
+ """Get phonemes for a word"""
79
+ word_lower = word.lower()
80
+
81
+ if word_lower in self.cmu_dict:
82
+ # Remove stress markers
83
+ phonemes = self.cmu_dict[word_lower][0]
84
+ return [re.sub(r"[0-9]", "", p) for p in phonemes]
85
+ else:
86
+ # Simple fallback
87
+ return self._estimate_phonemes(word)
88
+
89
+ def _get_ipa(self, word: str) -> str:
90
+ """Get IPA transcription"""
91
+ try:
92
+ return ipa.convert(word)
93
+ except:
94
+ return f"/{word}/"
95
+
96
+ def _estimate_phonemes(self, word: str) -> List[str]:
97
+ """Estimate phonemes for unknown words"""
98
+ phoneme_map = {
99
+ "ch": ["CH"],
100
+ "sh": ["SH"],
101
+ "th": ["TH"],
102
+ "ph": ["F"],
103
+ "ck": ["K"],
104
+ "ng": ["NG"],
105
+ "qu": ["K", "W"],
106
+ "a": ["AE"],
107
+ "e": ["EH"],
108
+ "i": ["IH"],
109
+ "o": ["AH"],
110
+ "u": ["AH"],
111
+ "b": ["B"],
112
+ "c": ["K"],
113
+ "d": ["D"],
114
+ "f": ["F"],
115
+ "g": ["G"],
116
+ "h": ["HH"],
117
+ "j": ["JH"],
118
+ "k": ["K"],
119
+ "l": ["L"],
120
+ "m": ["M"],
121
+ "n": ["N"],
122
+ "p": ["P"],
123
+ "r": ["R"],
124
+ "s": ["S"],
125
+ "t": ["T"],
126
+ "v": ["V"],
127
+ "w": ["W"],
128
+ "x": ["K", "S"],
129
+ "y": ["Y"],
130
+ "z": ["Z"],
131
+ }
132
+
133
+ word = word.lower()
134
+ phonemes = []
135
+ i = 0
136
+
137
+ while i < len(word):
138
+ # Check 2-letter combinations first
139
+ if i <= len(word) - 2:
140
+ two_char = word[i : i + 2]
141
+ if two_char in phoneme_map:
142
+ phonemes.extend(phoneme_map[two_char])
143
+ i += 2
144
+ continue
145
+
146
+ # Single character
147
+ char = word[i]
148
+ if char in phoneme_map:
149
+ phonemes.extend(phoneme_map[char])
150
+
151
+ i += 1
152
+
153
+ return phonemes
154
+
155
+
156
class SimplePhonemeComparator:
    """Positional phoneme comparison tuned for Vietnamese learners of English.

    Flattens both phoneme sequences and compares index-by-index, scoring each
    position as correct (1.0), acceptable substitution (0.7), wrong (0.3),
    missing (0.0) or extra (0.0).
    """

    def __init__(self):
        # Per-phoneme difficulty for Vietnamese speakers (higher = harder).
        self.difficulty_map = {
            "TH": 0.9,
            "DH": 0.9,
            "V": 0.8,
            "Z": 0.8,
            "ZH": 0.9,
            "R": 0.7,
            "L": 0.6,
            "W": 0.5,
            "F": 0.4,
            "S": 0.3,
            "SH": 0.5,
            "CH": 0.4,
            "JH": 0.5,
            "NG": 0.3,
        }

        # Common substitution patterns for Vietnamese speakers
        # (reference phoneme -> substitutions that still count as acceptable).
        self.substitution_patterns = {
            "TH": ["F", "S", "T"],
            "DH": ["D", "Z", "V"],
            "V": ["W", "F"],
            "R": ["L"],
            "L": ["R"],
            "Z": ["S"],
        }

    def compare_phonemes(
        self, reference_phonemes: List[Dict], learner_phonemes: List[Dict]
    ) -> List[Dict]:
        """Compare reference and learner phoneme sequences position by position.

        Each input is a list of {"word", "phonemes", ...} dicts (SimpleG2P
        output). Returns one comparison record per aligned position.
        NOTE: alignment is purely positional — one insertion early on shifts
        every later comparison.
        """
        # Flatten per-word phoneme lists, remembering the owning word.
        ref_sequence = [
            {"phoneme": p, "word": wd["word"]}
            for wd in reference_phonemes
            for p in wd["phonemes"]
        ]
        learner_sequence = [
            {"phoneme": p, "word": wd["word"]}
            for wd in learner_phonemes
            for p in wd["phonemes"]
        ]

        comparisons = []
        for i in range(max(len(ref_sequence), len(learner_sequence))):
            ref_item = ref_sequence[i] if i < len(ref_sequence) else None
            learner_item = learner_sequence[i] if i < len(learner_sequence) else None

            if ref_item and learner_item:
                ref_phoneme = ref_item["phoneme"]
                learner_phoneme = learner_item["phoneme"]

                if ref_phoneme == learner_phoneme:
                    status, score = "correct", 1.0
                elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
                    status, score = "acceptable", 0.7
                else:
                    status, score = "wrong", 0.3

                comparisons.append(
                    {
                        "position": i,
                        "reference_phoneme": ref_phoneme,
                        "learner_phoneme": learner_phoneme,
                        "status": status,
                        "score": score,
                        "word": ref_item["word"],
                        "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
                    }
                )

            elif ref_item and not learner_item:
                # Learner ran out of phonemes: reference phoneme is missing.
                comparisons.append(
                    {
                        "position": i,
                        "reference_phoneme": ref_item["phoneme"],
                        "learner_phoneme": "",
                        "status": "missing",
                        "score": 0.0,
                        "word": ref_item["word"],
                        "difficulty": self.difficulty_map.get(ref_item["phoneme"], 0.3),
                    }
                )

            elif learner_item and not ref_item:
                # Learner produced an extra phoneme beyond the reference.
                comparisons.append(
                    {
                        "position": i,
                        "reference_phoneme": "",
                        "learner_phoneme": learner_item["phoneme"],
                        "status": "extra",
                        "score": 0.0,
                        "word": learner_item["word"],
                        # FIX: was hard-coded 0.3; look up the learner phoneme's
                        # difficulty, consistent with the other branches.
                        "difficulty": self.difficulty_map.get(
                            learner_item["phoneme"], 0.3
                        ),
                    }
                )

        return comparisons

    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
        """True if *learner* is a known acceptable substitute for *reference*."""
        return learner in self.substitution_patterns.get(reference, [])
273
+
274
+
275
class SimplePronunciationAssessor:
    """Simplified pronunciation assessor focused on core functionality.

    Pipeline: Whisper ASR -> G2P on both the reference text and the transcript
    -> positional phoneme comparison -> word highlights, wrong-word list,
    overall score and Vietnamese-learner-oriented feedback.
    """

    def __init__(self):
        print("Initializing Whisper model...")
        self.whisper_model = whisper.load_model("base.en", in_memory=True)
        print("Whisper model loaded successfully")

        self.g2p = SimpleG2P()
        self.comparator = SimplePhonemeComparator()
        # NOTE(review): sample_rate is never read in this class — presumably
        # intended for future audio preprocessing; confirm before removing.
        self.sample_rate = 16000

    def assess_pronunciation(self, audio_path: str, reference_text: str) -> Dict:
        """Assess the audio at *audio_path* against *reference_text*.

        Returns a dict whose keys match PronunciationAssessmentResult.
        """
        # Step 1: Whisper ASR
        print("Running Whisper transcription...")
        asr_result = self.whisper_model.transcribe(audio_path)
        transcript = asr_result["text"].strip()
        print(f"Transcript: '{transcript}'")

        # Step 2: Get reference phonemes
        print("Getting reference phonemes...")
        reference_phonemes = self.g2p.text_to_phonemes(reference_text)

        # Step 3: Get learner phonemes from transcript
        print("Getting learner phonemes...")
        learner_phonemes = self.g2p.text_to_phonemes(transcript)

        # Step 4: Compare phonemes
        print("Comparing phonemes...")
        phoneme_comparisons = self.comparator.compare_phonemes(
            reference_phonemes, learner_phonemes
        )

        # Step 5: Generate word highlights
        print("Generating word highlights...")
        word_highlights = self._generate_word_highlights(
            reference_phonemes, learner_phonemes, phoneme_comparisons
        )

        # Step 6: Identify wrong words
        print("Identifying wrong words...")
        wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)

        # Step 7: Calculate overall score
        overall_score = self._calculate_overall_score(phoneme_comparisons)

        # Step 8: Generate feedback
        feedback = self._generate_simple_feedback(
            overall_score, wrong_words, phoneme_comparisons
        )

        return {
            "transcript": transcript,
            "overall_score": overall_score,
            "word_highlights": word_highlights,
            "phoneme_differences": phoneme_comparisons,
            "wrong_words": wrong_words,
            "feedback": feedback,
        }

    def _generate_word_highlights(
        self,
        reference_phonemes: List[Dict],
        learner_phonemes: List[Dict],
        phoneme_comparisons: List[Dict],
    ) -> List[Dict]:
        """Build per-word highlight dicts (score, status, color, issues).

        NOTE(review): scores are pooled by word *text*, so a word repeated in
        the reference gets one averaged score shared by all occurrences.
        """
        # Group per-phoneme scores by owning word; missing/extra phonemes
        # (status not in the list below) are deliberately excluded here.
        word_scores = defaultdict(list)
        for comparison in phoneme_comparisons:
            if comparison["status"] in ["correct", "acceptable", "wrong"]:
                word_scores[comparison.get("word", "unknown")].append(
                    comparison["score"]
                )

        word_highlights = []
        for word_data in reference_phonemes:
            word = word_data["word"]
            # A word with no aligned phonemes at all defaults to score 0.0.
            avg_score = float(np.mean(word_scores.get(word, [0.0])))

            word_highlights.append(
                {
                    "word": word,
                    "score": avg_score,
                    "status": self._get_word_status(avg_score),
                    "color": self._get_word_color(avg_score),
                    "phonemes": word_data["phonemes"],
                    "ipa": word_data["ipa"],
                    "issues": self._get_word_issues(word, phoneme_comparisons),
                }
            )

        return word_highlights

    def _identify_wrong_words(
        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
    ) -> List[Dict]:
        """Collect words scoring below 0.6, with their specific phoneme issues."""
        wrong_words = []

        for word_highlight in word_highlights:
            if word_highlight["score"] >= 0.6:  # 0.6 = "wrong word" threshold
                continue
            word = word_highlight["word"]

            # Gather this word's wrong and missing phonemes.
            wrong_phonemes = []
            missing_phonemes = []
            for comparison in phoneme_comparisons:
                if comparison.get("word") != word:
                    continue
                if comparison["status"] == "wrong":
                    wrong_phonemes.append(
                        {
                            "expected": comparison["reference_phoneme"],
                            "actual": comparison["learner_phoneme"],
                        }
                    )
                elif comparison["status"] == "missing":
                    missing_phonemes.append(comparison["reference_phoneme"])

            word_issues = []
            if wrong_phonemes:
                word_issues.append(
                    f"Wrong sounds: {', '.join([p['expected'] for p in wrong_phonemes])}"
                )
            if missing_phonemes:
                word_issues.append(f"Missing sounds: {', '.join(missing_phonemes)}")

            wrong_words.append(
                {
                    "word": word,
                    "score": word_highlight["score"],
                    "expected_phonemes": word_highlight["phonemes"],
                    "ipa": word_highlight["ipa"],
                    "issues": word_issues,
                    "wrong_phonemes": wrong_phonemes,
                    "missing_phonemes": missing_phonemes,
                    "tips": self._get_pronunciation_tips(
                        word, wrong_phonemes, missing_phonemes
                    ),
                }
            )

        return wrong_words

    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
        """Mean per-phoneme score in [0, 1]; 0.0 for an empty comparison list."""
        if not phoneme_comparisons:
            return 0.0
        # Idiomatic sum() instead of a manual accumulation loop.
        return sum(c["score"] for c in phoneme_comparisons) / len(phoneme_comparisons)

    def _generate_simple_feedback(
        self,
        overall_score: float,
        wrong_words: List[Dict],
        phoneme_comparisons: List[Dict],
    ) -> List[str]:
        """Generate simple, actionable feedback strings (Vietnamese)."""
        feedback = []

        # Overall feedback, bucketed by score.
        if overall_score >= 0.8:
            feedback.append("Phát âm tốt! Bạn đã làm rất tốt.")
        elif overall_score >= 0.6:
            feedback.append("Phát âm khá tốt, còn một vài điểm cần cải thiện.")
        elif overall_score >= 0.4:
            feedback.append(
                "Cần luyện tập thêm. Tập trung vào những từ được đánh dấu đỏ."
            )
        else:
            feedback.append("Hãy luyện tập chậm và rõ ràng hơn.")

        # List up to three words that need practice.
        if wrong_words:
            word_names = [w["word"] for w in wrong_words[:3]]
            feedback.append(f"Các từ cần luyện tập: {', '.join(word_names)}")

        # Count how often each reference phoneme was pronounced wrong.
        problem_phonemes = defaultdict(int)
        for comparison in phoneme_comparisons:
            if comparison["status"] == "wrong":
                problem_phonemes[comparison["reference_phoneme"]] += 1

        # Vietnamese-specific articulation tips for the hardest sounds.
        vietnamese_tips = {
            "TH": "Đặt lưỡi giữa răng, thổi nhẹ",
            "DH": "Giống TH nhưng rung dây thanh",
            "V": "Chạm môi dưới vào răng trên",
            "R": "Cuộn lưỡi, không chạm vòm miệng",
            "L": "Đầu lưỡi chạm vòm miệng",
            "Z": "Giống S nhưng có rung dây thanh",
        }

        if problem_phonemes:
            most_difficult = sorted(
                problem_phonemes.items(), key=lambda x: x[1], reverse=True
            )
            # Only the two most frequent problem sounds, to keep feedback short.
            for phoneme, count in most_difficult[:2]:
                if phoneme in vietnamese_tips:
                    feedback.append(f"Âm {phoneme}: {vietnamese_tips[phoneme]}")

        return feedback

    def _get_word_status(self, score: float) -> str:
        """Map a word score to a status label."""
        if score >= 0.8:
            return "excellent"
        elif score >= 0.6:
            return "good"
        elif score >= 0.4:
            return "needs_practice"
        else:
            return "poor"

    def _get_word_color(self, score: float) -> str:
        """Map a word score to a UI highlight color (hex)."""
        if score >= 0.8:
            return "#22c55e"  # Green
        elif score >= 0.6:
            return "#84cc16"  # Light green
        elif score >= 0.4:
            return "#eab308"  # Yellow
        else:
            return "#ef4444"  # Red

    def _get_word_issues(self, word: str, phoneme_comparisons: List[Dict]) -> List[str]:
        """Summarize wrong/missing phoneme counts for *word* (Vietnamese)."""
        issues = []

        word_comparisons = [c for c in phoneme_comparisons if c.get("word") == word]
        wrong_count = len([c for c in word_comparisons if c["status"] == "wrong"])
        missing_count = len([c for c in word_comparisons if c["status"] == "missing"])

        if wrong_count > 0:
            issues.append(f"{wrong_count} sai âm")
        if missing_count > 0:
            issues.append(f"{missing_count} thiếu âm")

        return issues

    def _get_pronunciation_tips(
        self, word: str, wrong_phonemes: List[Dict], missing_phonemes: List[str]
    ) -> List[str]:
        """Articulation tips for a mispronounced word (Vietnamese)."""
        tips = []

        # Tips for phonemes known to be hard for Vietnamese speakers.
        phoneme_tips = {
            "TH": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ",
            "DH": "Giống TH nhưng rung dây thanh âm",
            "V": "Chạm môi dưới vào răng trên, không dùng cả hai môi",
            "R": "Cuộn lưỡi nhưng không chạm vào vòm miệng",
            "L": "Đầu lưỡi chạm vào vòm miệng sau răng",
            "Z": "Giống âm S nhưng có rung dây thanh âm",
        }

        for wrong in wrong_phonemes:
            expected = wrong["expected"]
            if expected in phoneme_tips:
                tips.append(f"Âm {expected}: {phoneme_tips[expected]}")

        for missing in missing_phonemes:
            if missing in phoneme_tips:
                tips.append(f"Thiếu âm {missing}: {phoneme_tips[missing]}")

        # Generic fallback when no phoneme-specific tip applies.
        if not tips:
            tips.append(f"Luyện tập từ '{word}' chậm và rõ ràng")

        return tips
562
+
563
+
564
+ # =============================================================================
565
+ # MAIN API ENDPOINT
566
+ # =============================================================================
567
+
568
# Initialize assessor
# Module-level singleton: the Whisper model is loaded once at import time and
# reused by every request (per-request loading would be prohibitively slow).
assessor = SimplePronunciationAssessor()
570
+
571
+
572
def convert_numpy_types(obj):
    """Recursively convert numpy types to JSON-serializable Python natives.

    Handles np.integer/np.floating/np.bool_ scalars, np.ndarray (-> list),
    and recurses into dicts, lists and tuples (tuples become lists, as in
    JSON). Anything else is returned unchanged.
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    # FIX: np.bool_ is also not JSON serializable and was previously passed
    # through unchanged.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    # FIX: tuples were previously not recursed into.
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_types(item) for item in obj]
    return obj
586
+
587
+
588
@router.post("/assess", response_model=PronunciationAssessmentResult)
async def assess_pronunciation(
    audio: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
    reference_text: str = Form(..., description="Reference text to compare against"),
):
    """
    Main API: Pronunciation Assessment

    Input: Audio file + Reference text
    Output: Word highlights + Phoneme differences + Wrong words

    Features:
    - Whisper ASR for transcript
    - CMU Dict phoneme mapping
    - Vietnamese-optimized comparison
    - Simple UI-ready output
    """

    import time

    start_time = time.time()

    # ---- Input validation: fail fast with 400s before touching the audio ----
    if not reference_text.strip():
        raise HTTPException(status_code=400, detail="Reference text cannot be empty")

    if len(reference_text) > 500:
        raise HTTPException(
            status_code=400, detail="Reference text too long (max 500 characters)"
        )

    # Check for valid English characters
    if not re.match(r"^[a-zA-Z\s\'\-\.!?,;:]+$", reference_text):
        raise HTTPException(
            status_code=400,
            detail="Text must contain only English letters, spaces, and basic punctuation",
        )

    tmp_path = None
    try:
        # Persist the upload to disk: Whisper's transcribe() takes a file path.
        file_extension = ".wav"
        if audio.filename and "." in audio.filename:
            file_extension = f".{audio.filename.split('.')[-1]}"

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=file_extension
        ) as tmp_file:
            tmp_path = tmp_file.name
            content = await audio.read()
            tmp_file.write(content)

        print(f"Processing audio file: {tmp_path}")

        # Run assessment
        result = assessor.assess_pronunciation(tmp_path, reference_text)

        # Convert numpy types for JSON serialization
        final_result = convert_numpy_types(result)

        processing_time = time.time() - start_time
        print(f"Assessment completed in {processing_time:.2f} seconds")

        return PronunciationAssessmentResult(**final_result)

    except Exception as e:
        print(f"Assessment error: {str(e)}")
        import traceback

        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Assessment failed: {str(e)}")
    finally:
        # BUG FIX: the temp file was only deleted on the success path and
        # leaked whenever the assessment raised.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
661
+
662
+
663
+ # =============================================================================
664
+ # UTILITY ENDPOINTS
665
+ # =============================================================================
666
+
667
+
668
@router.get("/phonemes/{word}")
async def get_word_phonemes(word: str):
    """Get the phoneme breakdown and difficulty analysis for a single word."""
    try:
        phoneme_entries = assessor.g2p.text_to_phonemes(word)
        # FIX: a word that cleans down to nothing (punctuation/whitespace only)
        # previously raised IndexError and surfaced as a generic 500.
        if not phoneme_entries:
            raise HTTPException(status_code=400, detail="No analyzable word provided")
        phoneme_data = phoneme_entries[0]

        # Per-phoneme difficulty for Vietnamese speakers (default 0.3).
        difficulty_scores = [
            assessor.comparator.difficulty_map.get(phoneme, 0.3)
            for phoneme in phoneme_data["phonemes"]
        ]
        avg_difficulty = float(np.mean(difficulty_scores)) if difficulty_scores else 0.3

        return {
            "word": word,
            "phonemes": phoneme_data["phonemes"],
            "ipa": phoneme_data["ipa"],
            "difficulty_score": avg_difficulty,
            "difficulty_level": (
                "hard"
                if avg_difficulty > 0.6
                else "medium" if avg_difficulty > 0.4 else "easy"
            ),
            "challenging_phonemes": [
                {
                    "phoneme": p,
                    "difficulty": assessor.comparator.difficulty_map.get(p, 0.3),
                }
                for p in phoneme_data["phonemes"]
                if assessor.comparator.difficulty_map.get(p, 0.3) > 0.6
            ],
        }

    except HTTPException:
        # FIX: don't re-wrap deliberate 4xx responses into a 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Word analysis error: {str(e)}")
704
+
705
+
706
@router.get("/health")
async def health_check():
    """Simple health check endpoint."""
    return {
        "status": "healthy",
        # FIX: previously reported "tiny", but SimplePronunciationAssessor
        # loads whisper.load_model("base.en").
        "whisper_model": "base.en",
        "cmu_dict_size": len(assessor.g2p.cmu_dict),
        "vietnamese_optimized": True,
    }