Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 29

Commit

2c94679

verified ·

1 Parent(s): a00cee2

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +549 -240

services/streaming_voice_service.py CHANGED Viewed

@@ -14,40 +14,497 @@ from core.speechbrain_vad import SpeechBrainVAD
 from core.silero_vad import SileroVAD
 class StreamingVoiceService:
     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
         self.client = groq_client
         self.rag_system = rag_system
         self.tts_service = tts_service
-        # Khởi tạo VAD
-        self.vad_processor = SileroVAD()
         self.is_listening = False
         self.speech_callback = None
-        self.is_processing = False  # Tránh xử lý chồng chéo
-        self.last_speech_time = 0
-        self.silence_timeout = 2.0  # 2 giây im lặng thì dừng
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
-        # Audio buffer for VAD
         self.audio_buffer = []
         self.buffer_lock = threading.Lock()
     def start_listening(self, speech_callback: Callable) -> bool:
-        """Bắt đầu lắng nghe với VAD"""
         if self.is_listening:
             return False
         self.speech_callback = speech_callback
-        self.last_speech_time = time.time()
         success = self.vad_processor.start_stream(self._on_speech_detected)
         if success:
             self.is_listening = True
             self.is_processing = False
-            print("🎙️ Đã bắt đầu lắng nghe với VAD")
         return success
     def stop_listening(self):
@@ -60,73 +517,51 @@ class StreamingVoiceService:
             self.audio_buffer = []
         print("🛑 Đã dừng lắng nghe")
-    def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
-        if not audio_data or not self.is_listening or self.is_processing:
-            return {
-                'transcription': "Đang lắng nghe...",
-                'response': "",
-                'tts_audio': None,
-                'status': 'listening'
-            }
-        try:
-            sample_rate, audio_array = audio_data
-            # Thêm vào buffer và xử lý với VAD
-            with self.buffer_lock:
-                self.audio_buffer.extend(audio_array)
-                # Giới hạn buffer để tránh tràn bộ nhớ
-                max_buffer_samples = sample_rate * 10  # 10 giây
-                if len(self.audio_buffer) > max_buffer_samples:
-                    self.audio_buffer = self.audio_buffer[-max_buffer_samples:]
-            # Xử lý với VAD
-            self.vad_processor.process_stream(audio_array, sample_rate)
-            # Kiểm tra timeout im lặng
-            current_time = time.time()
-            if current_time - self.last_speech_time > self.silence_timeout and len(self.audio_buffer) > 0:
-                self._process_final_audio()
-            return {
-                'transcription': "Đang lắng nghe...",
-                'response': "",
-                'tts_audio': None,
-                'status': 'listening'
-            }
-        except Exception as e:
-            print(f"❌ Lỗi xử lý audio chunk: {e}")
-            return {
-                'transcription': "",
-                'response': "",
-                'tts_audio': None,
-                'status': 'error'
-            }
     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
-        """Callback khi VAD phát hiện speech"""
-        print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
-        self.last_speech_time = time.time()
-        # Chỉ xử lý nếu không đang xử lý cái khác
         if self.is_processing:
-            print("⚠️ Đang xử lý request trước đó, bỏ qua...")
             return
-        self.is_processing = True
         try:
             # Chuyển đổi speech thành text
             transcription = self._transcribe_audio(speech_audio, sample_rate)
             if not transcription or len(transcription.strip()) < 2:
                 print("⚠️ Transcription quá ngắn hoặc trống")
-                self.is_processing = False
                 return
-            print(f"📝 VAD Transcription: {transcription}")
             self.current_transcription = transcription
             # Tạo phản hồi AI
@@ -145,152 +580,56 @@ class StreamingVoiceService:
                 })
         except Exception as e:
-            print(f"❌ Lỗi trong _on_speech_detected: {e}")
         finally:
-            # Cho phép xử lý tiếp sau khi TTS kết thúc
-            threading.Timer(1.0, self._reset_processing).start()
-    def _reset_processing(self):
-        """Reset trạng thái xử lý sau khi hoàn thành"""
-        self.is_processing = False
-        with self.buffer_lock:
-            self.audio_buffer = []
-    def _process_final_audio(self):
-        """Xử lý audio cuối cùng khi hết thời gian im lặng"""
-        if self.is_processing or not self.audio_buffer:
-            return
-        try:
-            with self.buffer_lock:
-                if not self.audio_buffer:
-                    return
-                final_audio = np.array(self.audio_buffer)
-                self.audio_buffer = []
-            # Chỉ xử lý nếu audio đủ dài
-            if len(final_audio) > 16000 * 0.5:  # Ít nhất 0.5 giây
-                print("🔄 Xử lý audio cuối cùng do im lặng timeout")
-                self._on_speech_detected(final_audio, 16000)
-        except Exception as e:
-            print(f"❌ Lỗi xử lý final audio: {e}")
-    def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming (phương thức cũ cho compatibility)"""
-        if not audio_data:
-            return {
-                'transcription': "❌ Không có dữ liệu âm thanh",
-                'response': "Vui lòng nói lại",
-                'tts_audio': None,
-                'status': 'error'
-            }
-        # Nếu đang xử lý VAD, trả về trạng thái listening
-        if self.is_processing:
             return {
-                'transcription': "Đang xử lý...",
                 'response': "",
                 'tts_audio': None,
-                'status': 'processing'
             }
         try:
-            # Lấy dữ liệu audio từ Gradio
             sample_rate, audio_array = audio_data
-            print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
-            # Kiểm tra kiểu dữ liệu và chuyển đổi nếu cần
-            if isinstance(audio_array, np.ndarray):
-                if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
-                    # Chuyển từ float sang int16
-                    audio_array = (audio_array * 32767).astype(np.int16)
-            # Kiểm tra audio có dữ liệu không
-            if len(audio_array) == 0:
-                return {
-                    'transcription': "❌ Âm thanh trống",
-                    'response': "Vui lòng nói lại",
-                    'tts_audio': None,
-                    'status': 'error'
-                }
-            # Tính toán âm lượng
-            audio_abs = np.abs(audio_array.astype(np.float32))
-            audio_rms = np.sqrt(np.mean(audio_abs**2)) / 32767.0
-            print(f"📊 Âm lượng RMS: {audio_rms:.4f}")
-            if audio_rms < 0.005:
-                return {
-                    'transcription': "❌ Âm thanh quá yếu",
-                    'response': "Xin vui lòng nói to hơn",
-                    'tts_audio': None,
-                    'status': 'error'
-                }
-            # Sử dụng VAD để kiểm tra speech
-            if not self.vad_processor.is_speech(audio_array, sample_rate):
-                return {
-                    'transcription': "❌ Không phát hiện giọng nói",
-                    'response': "Vui lòng nói rõ hơn",
-                    'tts_audio': None,
-                    'status': 'error'
-                }
-            # Chuyển đổi thành văn bản
-            transcription = self._transcribe_audio(audio_array, sample_rate)
-            if not transcription or len(transcription.strip()) == 0:
-                return {
-                    'transcription': "❌ Không nghe rõ",
-                    'response': "Xin vui lòng nói lại rõ hơn",
-                    'tts_audio': None,
-                    'status': 'error'
-                }
-            # Kiểm tra nếu transcription quá ngắn
-            if len(transcription.strip()) < 2:
-                return {
-                    'transcription': "❌ Câu nói quá ngắn",
-                    'response': "Xin vui lòng nói câu dài hơn",
-                    'tts_audio': None,
-                    'status': 'error'
-                }
-            print(f"📝 Đã chuyển đổi: {transcription}")
-            # Cập nhật transcription hiện tại
-            self.current_transcription = transcription
-            # Tạo phản hồi AI
-            response = self._generate_ai_response(transcription)
-            # Tạo TTS
-            tts_audio_path = self._text_to_speech(response)
             return {
-                'transcription': transcription,
-                'response': response,
-                'tts_audio': tts_audio_path,
-                'status': 'completed'
             }
         except Exception as e:
-            print(f"❌ Lỗi xử lý streaming audio: {e}")
-            print(f"Chi tiết lỗi: {traceback.format_exc()}")
             return {
-                'transcription': f"❌ Lỗi: {str(e)}",
-                'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
                 'tts_audio': None,
                 'status': 'error'
             }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
-        """Chuyển audio -> text với xử lý sample rate cải tiến"""
         try:
-            # Đảm bảo kiểu dữ liệu là int16
             if audio_data.dtype != np.int16:
                 if audio_data.dtype in [np.float32, np.float64]:
                     audio_data = (audio_data * 32767).astype(np.int16)
@@ -299,30 +638,26 @@ class StreamingVoiceService:
             # Chuẩn hóa audio data
             if audio_data.ndim > 1:
-                audio_data = np.mean(audio_data, axis=1).astype(np.int16)  # Chuyển sang mono
-            # Resample nếu sample rate không phải 16000Hz (Whisper yêu cầu)
             target_sample_rate = 16000
             if sample_rate != target_sample_rate:
                 audio_data = self._resample_audio(audio_data, sample_rate, target_sample_rate)
                 sample_rate = target_sample_rate
-                print(f"🔄 Đã resample từ {sample_rate}Hz xuống {target_sample_rate}Hz")
             # Giới hạn độ dài audio
-            max_duration = 10  # giây
             max_samples = sample_rate * max_duration
             if len(audio_data) > max_samples:
                 audio_data = audio_data[:max_samples]
-                print(f"⚠️ Cắt audio xuống còn {max_duration} giây")
             # Đảm bảo audio đủ dài
-            min_duration = 0.5  # giây
             min_samples = int(sample_rate * min_duration)
             if len(audio_data) < min_samples:
-                # Pad audio nếu quá ngắn
                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
                 audio_data = np.concatenate([audio_data, padding])
-                print(f"⚠️ Đã pad audio lên {min_duration} giây")
             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
@@ -331,8 +666,7 @@ class StreamingVoiceService:
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
-            # Gọi API Whisper với timeout
-            import requests
             try:
                 transcription = self.client.audio.transcriptions.create(
                     model=settings.WHISPER_MODEL,
@@ -341,9 +675,6 @@ class StreamingVoiceService:
                     language="vi",
                     temperature=0.0,
                 )
-            except requests.exceptions.Timeout:
-                print("❌ Whisper API timeout")
-                return None
             except Exception as e:
                 print(f"❌ Lỗi Whisper API: {e}")
                 return None
@@ -356,47 +687,13 @@ class StreamingVoiceService:
             else:
                 result = str(transcription).strip()
-            print(f"✅ Transcription thành công: '{result}'")
             return result
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
-            print(f"Audio details: dtype={audio_data.dtype}, shape={audio_data.shape}, sr={sample_rate}")
             return None
-    def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
-        """Resample audio sử dụng scipy - cải tiến độ chính xác"""
-        try:
-            from scipy import signal
-            # Tính số samples mới
-            duration = len(audio_data) / orig_sr
-            new_length = int(duration * target_sr)
-            # Resample sử dụng scipy.signal.resample với windowing
-            resampled_audio = signal.resample(audio_data, new_length)
-            # Chuyển lại về int16
-            resampled_audio = np.clip(resampled_audio, -32768, 32767).astype(np.int16)
-            return resampled_audio
-        except ImportError:
-            print("⚠️ Không có scipy, sử dụng simple resampling")
-            # Simple resampling bằng interpolation
-            orig_length = len(audio_data)
-            new_length = int(orig_length * target_sr / orig_sr)
-            # Linear interpolation
-            x_old = np.linspace(0, 1, orig_length)
-            x_new = np.linspace(0, 1, new_length)
-            resampled_audio = np.interp(x_new, x_old, audio_data).astype(np.int16)
-            return resampled_audio
-        except Exception as e:
-            print(f"❌ Lỗi resample: {e}")
-            return audio_data
     def _generate_ai_response(self, user_input: str) -> str:
         """Sinh phản hồi AI với xử lý lỗi"""
         try:
@@ -414,8 +711,8 @@ Thông tin tham khảo:
 """
             messages = [{"role": "system", "content": system_prompt}]
-            # Giữ lại 4 tin nhắn gần nhất
-            messages.extend(self.conversation_history[-4:])
             completion = self.client.chat.completions.create(
                 model="llama-3.1-8b-instant",
@@ -428,8 +725,8 @@ Thông tin tham khảo:
             self.conversation_history.append({"role": "assistant", "content": response})
             # Giới hạn lịch sử
-            if len(self.conversation_history) > 8:
-                self.conversation_history = self.conversation_history[-8:]
             return response
@@ -438,7 +735,7 @@ Thông tin tham khảo:
             return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
     def _text_to_speech(self, text: str) -> Optional[str]:
-        """Chuyển văn bản thành giọng nói với xử lý lỗi"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
@@ -452,6 +749,17 @@ Thông tin tham khảo:
             print(f"❌ Lỗi TTS: {e}")
         return None
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         self.conversation_history = []
@@ -465,5 +773,6 @@ Thông tin tham khảo:
             'is_processing': self.is_processing,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")
         }

 from core.silero_vad import SileroVAD
+# class StreamingVoiceService:
+#     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
+#         self.client = groq_client
+#         self.rag_system = rag_system
+#         self.tts_service = tts_service
+#         # Khởi tạo VAD
+#         self.vad_processor = SileroVAD()
+#         self.is_listening = False
+#         self.speech_callback = None
+#         self.is_processing = False  # Tránh xử lý chồng chéo
+#         self.last_speech_time = 0
+#         self.silence_timeout = 2.0  # 2 giây im lặng thì dừng
+#         # Conversation context
+#         self.conversation_history = []
+#         self.current_transcription = ""
+#         # Audio buffer for VAD
+#         self.audio_buffer = []
+#         self.buffer_lock = threading.Lock()
+#     def start_listening(self, speech_callback: Callable) -> bool:
+#         """Bắt đầu lắng nghe với VAD"""
+#         if self.is_listening:
+#             return False
+#         self.speech_callback = speech_callback
+#         self.last_speech_time = time.time()
+#         success = self.vad_processor.start_stream(self._on_speech_detected)
+#         if success:
+#             self.is_listening = True
+#             self.is_processing = False
+#             print("🎙️ Đã bắt đầu lắng nghe với VAD")
+#         return success
+#     def stop_listening(self):
+#         """Dừng lắng nghe"""
+#         self.vad_processor.stop_stream()
+#         self.is_listening = False
+#         self.is_processing = False
+#         self.speech_callback = None
+#         with self.buffer_lock:
+#             self.audio_buffer = []
+#         print("🛑 Đã dừng lắng nghe")
+#     def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
+#         """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
+#         if not audio_data or not self.is_listening or self.is_processing:
+#             return {
+#                 'transcription': "Đang lắng nghe...",
+#                 'response': "",
+#                 'tts_audio': None,
+#                 'status': 'listening'
+#             }
+#         try:
+#             sample_rate, audio_array = audio_data
+#             # Thêm vào buffer và xử lý với VAD
+#             with self.buffer_lock:
+#                 self.audio_buffer.extend(audio_array)
+#                 # Giới hạn buffer để tránh tràn bộ nhớ
+#                 max_buffer_samples = sample_rate * 10  # 10 giây
+#                 if len(self.audio_buffer) > max_buffer_samples:
+#                     self.audio_buffer = self.audio_buffer[-max_buffer_samples:]
+#             # Xử lý với VAD
+#             self.vad_processor.process_stream(audio_array, sample_rate)
+#             # Kiểm tra timeout im lặng
+#             current_time = time.time()
+#             if current_time - self.last_speech_time > self.silence_timeout and len(self.audio_buffer) > 0:
+#                 self._process_final_audio()
+#             return {
+#                 'transcription': "Đang lắng nghe...",
+#                 'response': "",
+#                 'tts_audio': None,
+#                 'status': 'listening'
+#             }
+#         except Exception as e:
+#             print(f"❌ Lỗi xử lý audio chunk: {e}")
+#             return {
+#                 'transcription': "",
+#                 'response': "",
+#                 'tts_audio': None,
+#                 'status': 'error'
+#             }
+#     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+#         """Callback khi VAD phát hiện speech"""
+#         print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
+#         self.last_speech_time = time.time()
+#         # Chỉ xử lý nếu không đang xử lý cái khác
+#         if self.is_processing:
+#             print("⚠️ Đang xử lý request trước đó, bỏ qua...")
+#             return
+#         self.is_processing = True
+#         try:
+#             # Chuyển đổi speech thành text
+#             transcription = self._transcribe_audio(speech_audio, sample_rate)
+#             if not transcription or len(transcription.strip()) < 2:
+#                 print("⚠️ Transcription quá ngắn hoặc trống")
+#                 self.is_processing = False
+#                 return
+#             print(f"📝 VAD Transcription: {transcription}")
+#             self.current_transcription = transcription
+#             # Tạo phản hồi AI
+#             response = self._generate_ai_response(transcription)
+#             # Tạo TTS
+#             tts_audio_path = self._text_to_speech(response)
+#             # Gửi kết quả đến callback
+#             if self.speech_callback:
+#                 self.speech_callback({
+#                     'transcription': transcription,
+#                     'response': response,
+#                     'tts_audio': tts_audio_path,
+#                     'status': 'completed'
+#                 })
+#         except Exception as e:
+#             print(f"❌ Lỗi trong _on_speech_detected: {e}")
+#         finally:
+#             # Cho phép xử lý tiếp sau khi TTS kết thúc
+#             threading.Timer(1.0, self._reset_processing).start()
+#     def _reset_processing(self):
+#         """Reset trạng thái xử lý sau khi hoàn thành"""
+#         self.is_processing = False
+#         with self.buffer_lock:
+#             self.audio_buffer = []
+#     def _process_final_audio(self):
+#         """Xử lý audio cuối cùng khi hết thời gian im lặng"""
+#         if self.is_processing or not self.audio_buffer:
+#             return
+#         try:
+#             with self.buffer_lock:
+#                 if not self.audio_buffer:
+#                     return
+#                 final_audio = np.array(self.audio_buffer)
+#                 self.audio_buffer = []
+#             # Chỉ xử lý nếu audio đủ dài
+#             if len(final_audio) > 16000 * 0.5:  # Ít nhất 0.5 giây
+#                 print("🔄 Xử lý audio cuối cùng do im lặng timeout")
+#                 self._on_speech_detected(final_audio, 16000)
+#         except Exception as e:
+#             print(f"❌ Lỗi xử lý final audio: {e}")
+#     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+#         """Xử lý audio streaming (phương thức cũ cho compatibility)"""
+#         if not audio_data:
+#             return {
+#                 'transcription': "❌ Không có dữ liệu âm thanh",
+#                 'response': "Vui lòng nói lại",
+#                 'tts_audio': None,
+#                 'status': 'error'
+#             }
+#         # Nếu đang xử lý VAD, trả về trạng thái listening
+#         if self.is_processing:
+#             return {
+#                 'transcription': "Đang xử lý...",
+#                 'response': "",
+#                 'tts_audio': None,
+#                 'status': 'processing'
+#             }
+#         try:
+#             # Lấy dữ liệu audio từ Gradio
+#             sample_rate, audio_array = audio_data
+#             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
+#             # Kiểm tra kiểu dữ liệu và chuyển đổi nếu cần
+#             if isinstance(audio_array, np.ndarray):
+#                 if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
+#                     # Chuyển từ float sang int16
+#                     audio_array = (audio_array * 32767).astype(np.int16)
+#             # Kiểm tra audio có dữ liệu không
+#             if len(audio_array) == 0:
+#                 return {
+#                     'transcription': "❌ Âm thanh trống",
+#                     'response': "Vui lòng nói lại",
+#                     'tts_audio': None,
+#                     'status': 'error'
+#                 }
+#             # Tính toán âm lượng
+#             audio_abs = np.abs(audio_array.astype(np.float32))
+#             audio_rms = np.sqrt(np.mean(audio_abs**2)) / 32767.0
+#             print(f"📊 Âm lượng RMS: {audio_rms:.4f}")
+#             if audio_rms < 0.005:
+#                 return {
+#                     'transcription': "❌ Âm thanh quá yếu",
+#                     'response': "Xin vui lòng nói to hơn",
+#                     'tts_audio': None,
+#                     'status': 'error'
+#                 }
+#             # Sử dụng VAD để kiểm tra speech
+#             if not self.vad_processor.is_speech(audio_array, sample_rate):
+#                 return {
+#                     'transcription': "❌ Không phát hiện giọng nói",
+#                     'response': "Vui lòng nói rõ hơn",
+#                     'tts_audio': None,
+#                     'status': 'error'
+#                 }
+#             # Chuyển đổi thành văn bản
+#             transcription = self._transcribe_audio(audio_array, sample_rate)
+#             if not transcription or len(transcription.strip()) == 0:
+#                 return {
+#                     'transcription': "❌ Không nghe rõ",
+#                     'response': "Xin vui lòng nói lại rõ hơn",
+#                     'tts_audio': None,
+#                     'status': 'error'
+#                 }
+#             # Kiểm tra nếu transcription quá ngắn
+#             if len(transcription.strip()) < 2:
+#                 return {
+#                     'transcription': "❌ Câu nói quá ngắn",
+#                     'response': "Xin vui lòng nói câu dài hơn",
+#                     'tts_audio': None,
+#                     'status': 'error'
+#                 }
+#             print(f"📝 Đã chuyển đổi: {transcription}")
+#             # Cập nhật transcription hiện tại
+#             self.current_transcription = transcription
+#             # Tạo phản hồi AI
+#             response = self._generate_ai_response(transcription)
+#             # Tạo TTS
+#             tts_audio_path = self._text_to_speech(response)
+#             return {
+#                 'transcription': transcription,
+#                 'response': response,
+#                 'tts_audio': tts_audio_path,
+#                 'status': 'completed'
+#             }
+#         except Exception as e:
+#             print(f"❌ Lỗi xử lý streaming audio: {e}")
+#             print(f"Chi tiết lỗi: {traceback.format_exc()}")
+#             return {
+#                 'transcription': f"❌ Lỗi: {str(e)}",
+#                 'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
+#                 'tts_audio': None,
+#                 'status': 'error'
+#             }
+#     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
+#         """Chuyển audio -> text với xử lý sample rate cải tiến"""
+#         try:
+#             # Đảm bảo kiểu dữ liệu là int16
+#             if audio_data.dtype != np.int16:
+#                 if audio_data.dtype in [np.float32, np.float64]:
+#                     audio_data = (audio_data * 32767).astype(np.int16)
+#                 else:
+#                     audio_data = audio_data.astype(np.int16)
+#             # Chuẩn hóa audio data
+#             if audio_data.ndim > 1:
+#                 audio_data = np.mean(audio_data, axis=1).astype(np.int16)  # Chuyển sang mono
+#             # Resample nếu sample rate không phải 16000Hz (Whisper yêu cầu)
+#             target_sample_rate = 16000
+#             if sample_rate != target_sample_rate:
+#                 audio_data = self._resample_audio(audio_data, sample_rate, target_sample_rate)
+#                 sample_rate = target_sample_rate
+#                 print(f"🔄 Đã resample từ {sample_rate}Hz xuống {target_sample_rate}Hz")
+#             # Giới hạn độ dài audio
+#             max_duration = 10  # giây
+#             max_samples = sample_rate * max_duration
+#             if len(audio_data) > max_samples:
+#                 audio_data = audio_data[:max_samples]
+#                 print(f"⚠️ Cắt audio xuống còn {max_duration} giây")
+#             # Đảm bảo audio đủ dài
+#             min_duration = 0.5  # giây
+#             min_samples = int(sample_rate * min_duration)
+#             if len(audio_data) < min_samples:
+#                 # Pad audio nếu quá ngắn
+#                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
+#                 audio_data = np.concatenate([audio_data, padding])
+#                 print(f"⚠️ Đã pad audio lên {min_duration} giây")
+#             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
+#             # Tạo temporary file trong memory
+#             buffer = io.BytesIO()
+#             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
+#             buffer.seek(0)
+#             # Gọi API Whisper với timeout
+#             import requests
+#             try:
+#                 transcription = self.client.audio.transcriptions.create(
+#                     model=settings.WHISPER_MODEL,
+#                     file=("speech.wav", buffer.read(), "audio/wav"),
+#                     response_format="text",
+#                     language="vi",
+#                     temperature=0.0,
+#                 )
+#             except requests.exceptions.Timeout:
+#                 print("❌ Whisper API timeout")
+#                 return None
+#             except Exception as e:
+#                 print(f"❌ Lỗi Whisper API: {e}")
+#                 return None
+#             # Xử lý response
+#             if hasattr(transcription, 'text'):
+#                 result = transcription.text.strip()
+#             elif isinstance(transcription, str):
+#                 result = transcription.strip()
+#             else:
+#                 result = str(transcription).strip()
+#             print(f"✅ Transcription thành công: '{result}'")
+#             return result
+#         except Exception as e:
+#             print(f"❌ Lỗi transcription: {e}")
+#             print(f"Audio details: dtype={audio_data.dtype}, shape={audio_data.shape}, sr={sample_rate}")
+#             return None
+#     def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+#         """Resample audio sử dụng scipy - cải tiến độ chính xác"""
+#         try:
+#             from scipy import signal
+#             # Tính số samples mới
+#             duration = len(audio_data) / orig_sr
+#             new_length = int(duration * target_sr)
+#             # Resample sử dụng scipy.signal.resample với windowing
+#             resampled_audio = signal.resample(audio_data, new_length)
+#             # Chuyển lại về int16
+#             resampled_audio = np.clip(resampled_audio, -32768, 32767).astype(np.int16)
+#             return resampled_audio
+#         except ImportError:
+#             print("⚠️ Không có scipy, sử dụng simple resampling")
+#             # Simple resampling bằng interpolation
+#             orig_length = len(audio_data)
+#             new_length = int(orig_length * target_sr / orig_sr)
+#             # Linear interpolation
+#             x_old = np.linspace(0, 1, orig_length)
+#             x_new = np.linspace(0, 1, new_length)
+#             resampled_audio = np.interp(x_new, x_old, audio_data).astype(np.int16)
+#             return resampled_audio
+#         except Exception as e:
+#             print(f"❌ Lỗi resample: {e}")
+#             return audio_data
+#     def _generate_ai_response(self, user_input: str) -> str:
+#         """Sinh phản hồi AI với xử lý lỗi"""
+#         try:
+#             # Thêm vào lịch sử
+#             self.conversation_history.append({"role": "user", "content": user_input})
+#             # Tìm kiếm RAG
+#             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
+#             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
+#             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
+# Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
+# Thông tin tham khảo:
+# {context_text}
+# """
+#             messages = [{"role": "system", "content": system_prompt}]
+#             # Giữ lại 4 tin nhắn gần nhất
+#             messages.extend(self.conversation_history[-4:])
+#             completion = self.client.chat.completions.create(
+#                 model="llama-3.1-8b-instant",
+#                 messages=messages,
+#                 max_tokens=150,
+#                 temperature=0.7
+#             )
+#             response = completion.choices[0].message.content
+#             self.conversation_history.append({"role": "assistant", "content": response})
+#             # Giới hạn lịch sử
+#             if len(self.conversation_history) > 8:
+#                 self.conversation_history = self.conversation_history[-8:]
+#             return response
+#         except Exception as e:
+#             print(f"❌ Lỗi tạo AI response: {e}")
+#             return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
+#     def _text_to_speech(self, text: str) -> Optional[str]:
+#         """Chuyển văn bản thành giọng nói với xử lý lỗi"""
+#         try:
+#             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
+#                 return None
+#             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
+#             if tts_bytes:
+#                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
+#                 print(f"✅ Đã tạo TTS: {audio_path}")
+#                 return audio_path
+#         except Exception as e:
+#             print(f"❌ Lỗi TTS: {e}")
+#         return None
+#     def clear_conversation(self):
+#         """Xóa lịch sử hội thoại"""
+#         self.conversation_history = []
+#         self.current_transcription = ""
+#         print("🗑️ Đã xóa lịch sử hội thoại")
+#     def get_conversation_state(self) -> dict:
+#         """Lấy trạng thái hội thoại"""
+#         return {
+#             'is_listening': self.is_listening,
+#             'is_processing': self.is_processing,
+#             'history_length': len(self.conversation_history),
+#             'current_transcription': self.current_transcription,
+#             'last_update': time.strftime("%H:%M:%S")
+#         }
 class StreamingVoiceService:
     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
         self.client = groq_client
         self.rag_system = rag_system
         self.tts_service = tts_service
+        # Initialize the optimized VAD (voice activity detection) processor
+        self.vad_processor = OptimizedSileroVAD()
         self.is_listening = False
         self.speech_callback = None
+        self.is_processing = False  # set to True while _process_speech_segment handles a segment
+        self.processing_lock = threading.Lock()  # held while is_processing is written
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
+        # Audio buffer (mutated under buffer_lock in process_audio_chunk)
         self.audio_buffer = []
         self.buffer_lock = threading.Lock()
+        # Response queue used to keep processing in arrival order
+        self.response_queue = queue.Queue()  # filled by _on_speech_detected, drained by _process_response_queue
+        self.current_task = None  # NOTE(review): only assigned here in the visible code
     def start_listening(self, speech_callback: Callable) -> bool:
+        """Start listening with the optimized VAD.
+
+        Stores ``speech_callback``, then starts the VAD stream with
+        ``_on_speech_detected`` registered as its callback. When the VAD
+        reports success, the service is flagged as listening and a daemon
+        thread running ``_process_response_queue`` is launched.
+
+        Returns False right away if already listening; otherwise returns
+        the result of ``vad_processor.start_stream``.
+        """
         if self.is_listening:
             return False
         self.speech_callback = speech_callback
         success = self.vad_processor.start_stream(self._on_speech_detected)
         if success:
             self.is_listening = True
             self.is_processing = False
+            # Start the response-processing thread (one new worker per successful call)
+            threading.Thread(target=self._process_response_queue, daemon=True).start()
+            print("🎙️ Đã bắt đầu lắng nghe với VAD tối ưu")
         return success
     def stop_listening(self):
             self.audio_buffer = []
         print("🛑 Đã dừng lắng nghe")
     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+        """Callback for when the VAD detects speech (optimized): log the segment duration and enqueue it."""
+        print(f"🎯 VAD phát hiện speech: {len(speech_audio)/sample_rate:.2f}s")
+        # Enqueue the segment instead of processing it directly in this callback
+        self.response_queue.put((speech_audio, sample_rate))
+    def _process_response_queue(self):
+        """Process queued speech segments sequentially.
+
+        Loops while ``self.is_listening`` is true. Each queue item is a
+        ``(speech_audio, sample_rate)`` tuple that is handed to
+        ``_process_speech_segment``. Any exception other than
+        ``queue.Empty`` is printed and the loop keeps going.
+        """
+        while self.is_listening:
+            try:
+                # Wait for an item; the timeout lets the loop re-check is_listening
+                speech_audio, sample_rate = self.response_queue.get(timeout=1.0)
+                # Process the speech segment
+                self._process_speech_segment(speech_audio, sample_rate)
+                # Mark the task as done. NOTE(review): skipped if the call above raises
+                self.response_queue.task_done()
+            except queue.Empty:
+                continue
+            except Exception as e:
+                print(f"❌ Lỗi trong response queue: {e}")
+                continue
+    def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
+        """Xử lý speech segment - TỐI ƯU HÓA"""
+        # Kiểm tra nếu đang xử lý
         if self.is_processing:
+            print("⚠️ Bỏ qua speech segment - đang xử lý request trước")
             return
+        with self.processing_lock:
+            self.is_processing = True
         try:
             # Chuyển đổi speech thành text
             transcription = self._transcribe_audio(speech_audio, sample_rate)
             if not transcription or len(transcription.strip()) < 2:
                 print("⚠️ Transcription quá ngắn hoặc trống")
                 return
+            print(f"📝 Transcription: {transcription}")
             self.current_transcription = transcription
             # Tạo phản hồi AI
                 })
         except Exception as e:
+            print(f"❌ Lỗi xử lý speech segment: {e}")
+            traceback.print_exc()
         finally:
+            with self.processing_lock:
+                self.is_processing = False
+    def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
+        """Buffer one audio chunk and pass it to the VAD.
+
+        ``audio_data`` is unpacked as ``(sample_rate, audio_array)``.
+
+        Returns a dict with the keys ``transcription``, ``response``,
+        ``tts_audio`` and ``status``. ``status`` is ``'listening'`` when the
+        chunk was accepted, or when there is no data / the service is not
+        listening, and ``'error'`` if an exception was raised. This method
+        never returns an actual transcription or response; presumably those
+        are delivered through the VAD callback (TODO confirm).
+        """
+        if not audio_data or not self.is_listening:
             return {
+                'transcription': "Đang lắng nghe...",
                 'response': "",
                 'tts_audio': None,
+                'status': 'listening'
             }
         try:
             sample_rate, audio_array = audio_data
+            # Append to the buffer, then run the VAD
+            with self.buffer_lock:
+                self.audio_buffer.extend(audio_array)  # assumes audio_array is 1-D — TODO confirm
+                # Cap the buffer size, keeping only the most recent samples
+                max_buffer_samples = sample_rate * 15  # 15 seconds
+                if len(self.audio_buffer) > max_buffer_samples:
+                    self.audio_buffer = self.audio_buffer[-max_buffer_samples:]
+            # Run the VAD on this chunk
+            self.vad_processor.process_stream(audio_array, sample_rate)
             return {
+                'transcription': "Đang lắng nghe...",
+                'response': "",
+                'tts_audio': None,
+                'status': 'listening'
             }
         except Exception as e:
+            print(f"❌ Lỗi xử lý audio chunk: {e}")
             return {
+                'transcription': "",
+                'response': "",
                 'tts_audio': None,
                 'status': 'error'
             }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
+        """Chuyển audio -> text với xử lý cải tiến"""
         try:
+            # Đảm bảo kiểu dữ liệu và chuẩn hóa
             if audio_data.dtype != np.int16:
                 if audio_data.dtype in [np.float32, np.float64]:
                     audio_data = (audio_data * 32767).astype(np.int16)
             # Chuẩn hóa audio data
             if audio_data.ndim > 1:
+                audio_data = np.mean(audio_data, axis=1).astype(np.int16)
+            # Resample nếu cần
             target_sample_rate = 16000
             if sample_rate != target_sample_rate:
                 audio_data = self._resample_audio(audio_data, sample_rate, target_sample_rate)
                 sample_rate = target_sample_rate
             # Giới hạn độ dài audio
+            max_duration = 15  # giây
             max_samples = sample_rate * max_duration
             if len(audio_data) > max_samples:
                 audio_data = audio_data[:max_samples]
             # Đảm bảo audio đủ dài
+            min_duration = 0.8  # giây
             min_samples = int(sample_rate * min_duration)
             if len(audio_data) < min_samples:
                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
                 audio_data = np.concatenate([audio_data, padding])
             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
+            # Gọi API Whisper
             try:
                 transcription = self.client.audio.transcriptions.create(
                     model=settings.WHISPER_MODEL,
                     language="vi",
                     temperature=0.0,
                 )
             except Exception as e:
                 print(f"❌ Lỗi Whisper API: {e}")
                 return None
             else:
                 result = str(transcription).strip()
+            print(f"✅ Transcription: '{result}'")
             return result
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
             return None
     def _generate_ai_response(self, user_input: str) -> str:
         """Sinh phản hồi AI với xử lý lỗi"""
         try:
 """
             messages = [{"role": "system", "content": system_prompt}]
+            # Giữ lại 6 tin nhắn gần nhất
+            messages.extend(self.conversation_history[-6:])
             completion = self.client.chat.completions.create(
                 model="llama-3.1-8b-instant",
             self.conversation_history.append({"role": "assistant", "content": response})
             # Giới hạn lịch sử
+            if len(self.conversation_history) > 12:
+                self.conversation_history = self.conversation_history[-12:]
             return response
             return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
     def _text_to_speech(self, text: str) -> Optional[str]:
+        """Chuyển văn bản thành giọng nói"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
             print(f"❌ Lỗi TTS: {e}")
         return None
+    def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+        """Resample audio from ``orig_sr`` to ``target_sr``.
+
+        Uses ``scipy.signal.resample`` to produce
+        ``int(len(audio_data) / orig_sr * target_sr)`` samples, clips them to
+        the int16 range and returns them as ``np.int16``.
+
+        If anything raises (including a missing scipy), the input is
+        returned unchanged and nothing is logged.
+        """
+        try:
+            from scipy import signal  # imported lazily; an ImportError lands in the except below
+            duration = len(audio_data) / orig_sr
+            new_length = int(duration * target_sr)
+            resampled_audio = signal.resample(audio_data, new_length)
+            return np.clip(resampled_audio, -32768, 32767).astype(np.int16)  # clip to the int16 range before casting
+        except Exception:
+            return audio_data  # NOTE(review): silent fallback; _transcribe_audio still relabels the result as target_sr
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         self.conversation_history = []
             'is_processing': self.is_processing,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
+            'queue_size': self.response_queue.qsize(),
             'last_update': time.strftime("%H:%M:%S")
         }