Spaces:

datbkpro
/

voicebot

Running

App Files Files Community

datbkpro committed 16 days ago

Commit

2f406aa

verified ·

1 Parent(s): cc8629b

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +110 -97

services/streaming_voice_service.py CHANGED Viewed

@@ -8,7 +8,6 @@ from typing import Optional, Dict, Any
 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
-from core.speechbrain_vad import SpeechBrainVAD  # THÊM IMPORT
 class StreamingVoiceService:
@@ -17,85 +16,12 @@ class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
-        # Khởi tạo VAD
-        self.vad_processor = SpeechBrainVAD()
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
-        self.is_listening = False
-    def start_listening(self) -> bool:
-        """Bắt đầu lắng nghe với VAD"""
-        if self.is_listening:
-            return False
-        success = self.vad_processor.start_stream(self._on_speech_detected)
-        if success:
-            self.is_listening = True
-            print("🎙️ Đã bắt đầu lắng nghe với VAD")
-        return success
-    def stop_listening(self):
-        """Dừng lắng nghe"""
-        self.vad_processor.stop_stream()
-        self.is_listening = False
-        print("🛑 Đã dừng lắng nghe")
-    def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
-        if not audio_data or not self.is_listening:
-            return {
-                'transcription': "",
-                'response': "",
-                'tts_audio': None
-            }
-        try:
-            sample_rate, audio_array = audio_data
-            # Xử lý với VAD
-            self.vad_processor.process_stream(audio_array, sample_rate)
-            return {
-                'transcription': "Đang lắng nghe...",
-                'response': "",
-                'tts_audio': None
-            }
-        except Exception as e:
-            print(f"❌ Lỗi xử lý audio chunk: {e}")
-            return {
-                'transcription': "",
-                'response': "",
-                'tts_audio': None
-            }
-    def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
-        """Callback khi VAD phát hiện speech"""
-        print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
-        # Chuyển đổi speech thành text
-        transcription = self._transcribe_audio(speech_audio, sample_rate)
-        if not transcription or len(transcription.strip()) < 2:
-            print("⚠️ Transcription quá ngắn hoặc trống")
-            return
-        print(f"📝 VAD Transcription: {transcription}")
-        self.current_transcription = transcription
-        # Tạo phản hồi AI
-        response = self._generate_ai_response(transcription)
-        # Tạo TTS
-        tts_audio_path = self._text_to_speech(response)
-        # Có thể gửi kết quả đến UI thông qua callback
-        # (cần tích hợp với Gradio events)
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming (phương thức cũ cho compatibility)"""
         if not audio_data:
             return {
                 'transcription': "❌ Không có dữ liệu âm thanh",
@@ -104,29 +30,58 @@ class StreamingVoiceService:
             }
         try:
             sample_rate, audio_array = audio_data
             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
-            # Sử dụng VAD để kiểm tra speech
-            if not self.vad_processor.is_speech(audio_array, sample_rate):
                 return {
-                    'transcription': "❌ Không phát hiện giọng nói",
-                    'response': "Vui lòng nói rõ hơn",
                     'tts_audio': None
                 }
             # Chuyển đổi thành văn bản
             transcription = self._transcribe_audio(audio_array, sample_rate)
-            if not transcription or len(transcription.strip()) < 2:
                 return {
                     'transcription': "❌ Không nghe rõ",
                     'response': "Xin vui lòng nói lại rõ hơn",
                     'tts_audio': None
                 }
             print(f"📝 Đã chuyển đổi: {transcription}")
             self.current_transcription = transcription
             # Tạo phản hồi AI
@@ -143,38 +98,57 @@ class StreamingVoiceService:
         except Exception as e:
             print(f"❌ Lỗi xử lý streaming audio: {e}")
             return {
                 'transcription': f"❌ Lỗi: {str(e)}",
-                'response': "Xin lỗi, có lỗi xảy ra",
                 'tts_audio': None
             }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
-        """Chuyển audio -> text (giữ nguyên)"""
-        # ... giữ nguyên code cũ ...
         try:
             if audio_data.ndim > 1:
-                audio_data = np.mean(audio_data, axis=1)
-            audio_max = np.max(np.abs(audio_data))
-            if audio_max > 0.1:
-                audio_data = audio_data / audio_max * 0.9
-            max_duration = 15
             max_samples = sample_rate * max_duration
             if len(audio_data) > max_samples:
                 audio_data = audio_data[:max_samples]
-            min_duration = 1.0
-            min_samples = sample_rate * min_duration
             if len(audio_data) < min_samples:
-                padding = np.zeros(min_samples - len(audio_data))
                 audio_data = np.concatenate([audio_data, padding])
             buffer = io.BytesIO()
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
             transcription = self.client.audio.transcriptions.create(
                 model=settings.WHISPER_MODEL,
                 file=("speech.wav", buffer.read(), "audio/wav"),
@@ -183,6 +157,7 @@ class StreamingVoiceService:
                 temperature=0.0,
             )
             if hasattr(transcription, 'text'):
                 result = transcription.text.strip()
             elif isinstance(transcription, str):
@@ -190,28 +165,65 @@ class StreamingVoiceService:
             else:
                 result = str(transcription).strip()
             return result
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
             return None
     def _generate_ai_response(self, user_input: str) -> str:
-        """Sinh phản hồi AI (giữ nguyên)"""
-        # ... giữ nguyên code cũ ...
         try:
             self.conversation_history.append({"role": "user", "content": user_input})
             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
-Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
 Thông tin tham khảo:
 {context_text}
 """
             messages = [{"role": "system", "content": system_prompt}]
             messages.extend(self.conversation_history[-4:])
             completion = self.client.chat.completions.create(
@@ -224,16 +236,17 @@ Thông tin tham khảo:
             response = completion.choices[0].message.content
             self.conversation_history.append({"role": "assistant", "content": response})
             if len(self.conversation_history) > 8:
                 self.conversation_history = self.conversation_history[-8:]
             return response
         except Exception as e:
-            return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
     def _text_to_speech(self, text: str) -> Optional[str]:
-        """Chuyển văn bản thành giọng nói (giữ nguyên)"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
@@ -241,6 +254,7 @@ Thông tin tham khảo:
             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
             if tts_bytes:
                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
                 return audio_path
         except Exception as e:
             print(f"❌ Lỗi TTS: {e}")
@@ -255,7 +269,6 @@ Thông tin tham khảo:
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
-            'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")

 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
 class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming từ Gradio microphone component"""
         if not audio_data:
             return {
                 'transcription': "❌ Không có dữ liệu âm thanh",
             }
         try:
+            # Lấy dữ liệu audio từ Gradio
             sample_rate, audio_array = audio_data
             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
+            # Kiểm tra kiểu dữ liệu và chuyển đổi nếu cần
+            if isinstance(audio_array, np.ndarray):
+                if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
+                    # Chuyển từ float sang int16
+                    audio_array = (audio_array * 32767).astype(np.int16)
+            # Kiểm tra audio có dữ liệu không
+            if len(audio_array) == 0:
+                return {
+                    'transcription': "❌ Âm thanh trống",
+                    'response': "Vui lòng nói lại",
+                    'tts_audio': None
+                }
+            # Tính toán âm lượng
+            audio_abs = np.abs(audio_array.astype(np.float32))
+            audio_rms = np.sqrt(np.mean(audio_abs**2)) / 32767.0
+            print(f"📊 Âm lượng RMS: {audio_rms:.4f}")
+            if audio_rms < 0.005:
                 return {
+                    'transcription': "❌ Âm thanh quá yếu",
+                    'response': "Xin vui lòng nói to hơn",
                     'tts_audio': None
                 }
             # Chuyển đổi thành văn bản
             transcription = self._transcribe_audio(audio_array, sample_rate)
+            if not transcription or len(transcription.strip()) == 0:
                 return {
                     'transcription': "❌ Không nghe rõ",
                     'response': "Xin vui lòng nói lại rõ hơn",
                     'tts_audio': None
                 }
+            # Kiểm tra nếu transcription quá ngắn
+            if len(transcription.strip()) < 2:
+                return {
+                    'transcription': "❌ Câu nói quá ngắn",
+                    'response': "Xin vui lòng nói câu dài hơn",
+                    'tts_audio': None
+                }
             print(f"📝 Đã chuyển đổi: {transcription}")
+            # Cập nhật transcription hiện tại
             self.current_transcription = transcription
             # Tạo phản hồi AI
         except Exception as e:
             print(f"❌ Lỗi xử lý streaming audio: {e}")
+            print(f"Chi tiết lỗi: {traceback.format_exc()}")
             return {
                 'transcription': f"❌ Lỗi: {str(e)}",
+                'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
                 'tts_audio': None
             }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
+        """Chuyển audio -> text với xử lý sample rate"""
         try:
+            # Đảm bảo kiểu dữ liệu là int16
+            if audio_data.dtype != np.int16:
+                if audio_data.dtype in [np.float32, np.float64]:
+                    audio_data = (audio_data * 32767).astype(np.int16)
+                else:
+                    audio_data = audio_data.astype(np.int16)
+            # Chuẩn hóa audio data
             if audio_data.ndim > 1:
+                audio_data = np.mean(audio_data, axis=1).astype(np.int16)  # Chuyển sang mono
+            # Resample nếu sample rate không phải 16000Hz (Whisper yêu cầu)
+            target_sample_rate = 16000
+            if sample_rate != target_sample_rate:
+                audio_data = self._resample_audio(audio_data, sample_rate, target_sample_rate)
+                sample_rate = target_sample_rate
+                print(f"🔄 Đã resample từ {sample_rate}Hz xuống {target_sample_rate}Hz")
+            # Giới hạn độ dài audio
+            max_duration = 10  # giây
             max_samples = sample_rate * max_duration
             if len(audio_data) > max_samples:
                 audio_data = audio_data[:max_samples]
+                print(f"⚠️ Cắt audio xuống còn {max_duration} giây")
+            # Đảm bảo audio đủ dài
+            min_duration = 0.5  # giây
+            min_samples = int(sample_rate * min_duration)
             if len(audio_data) < min_samples:
+                # Pad audio nếu quá ngắn
+                padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
                 audio_data = np.concatenate([audio_data, padding])
+                print(f"⚠️ Đã pad audio lên {min_duration} giây")
+            print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
             buffer = io.BytesIO()
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
+            # Gọi API Whisper
             transcription = self.client.audio.transcriptions.create(
                 model=settings.WHISPER_MODEL,
                 file=("speech.wav", buffer.read(), "audio/wav"),
                 temperature=0.0,
             )
+            # Xử lý response
             if hasattr(transcription, 'text'):
                 result = transcription.text.strip()
             elif isinstance(transcription, str):
             else:
                 result = str(transcription).strip()
+            print(f"✅ Transcription thành công: '{result}'")
             return result
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
+            print(f"Audio details: dtype={audio_data.dtype}, shape={audio_data.shape}, sr={sample_rate}")
             return None
+    def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+        """Resample PCM audio from ``orig_sr`` to ``target_sr`` and return int16 samples.
+        Uses ``scipy.signal.resample`` (FFT-based) when scipy is importable and
+        falls back to linear interpolation with ``np.interp`` otherwise.
+        The interpolation fallback only handles 1-D input; the visible caller
+        down-mixes to mono before calling this method.
+        Args:
+            audio_data: Samples to resample (the caller passes int16 mono).
+            orig_sr: Sample rate of ``audio_data`` in Hz.
+            target_sr: Desired sample rate in Hz.
+        Returns:
+            The resampled signal as int16. On an unexpected error the input is
+            returned unchanged (best effort), as is an empty input.
+        """
+        # Nothing to resample; avoids passing zero-length arrays to scipy/np.interp.
+        if len(audio_data) == 0:
+            return audio_data
+        try:
+            # One length formula for both code paths; never request 0 samples.
+            new_length = max(1, int(len(audio_data) * target_sr / orig_sr))
+            from scipy import signal
+            resampled_audio = signal.resample(audio_data, new_length)
+        except ImportError:
+            print("⚠️ Không có scipy, sử dụng simple resampling")
+            # Linear interpolation over a normalized [0, 1] time axis.
+            x_old = np.linspace(0, 1, len(audio_data))
+            x_new = np.linspace(0, 1, new_length)
+            resampled_audio = np.interp(x_new, x_old, audio_data)
+        except Exception as e:
+            print(f"❌ Lỗi resample: {e}")
+            return audio_data
+        # Resampling can overshoot the int16 range (ringing near sharp edges);
+        # a bare astype(int16) would wrap those peaks to the opposite sign.
+        # Round and clip first so peaks saturate instead of wrapping.
+        int16_info = np.iinfo(np.int16)
+        resampled_audio = np.clip(np.rint(resampled_audio), int16_info.min, int16_info.max)
+        return resampled_audio.astype(np.int16)
     def _generate_ai_response(self, user_input: str) -> str:
+        """Sinh phản hồi AI"""
         try:
+            # Thêm vào lịch sử
             self.conversation_history.append({"role": "user", "content": user_input})
+            # Tìm kiếm RAG
             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
+Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
 Thông tin tham khảo:
 {context_text}
 """
             messages = [{"role": "system", "content": system_prompt}]
+            # Giữ lại 4 tin nhắn gần nhất
             messages.extend(self.conversation_history[-4:])
             completion = self.client.chat.completions.create(
             response = completion.choices[0].message.content
             self.conversation_history.append({"role": "assistant", "content": response})
+            # Giới hạn lịch sử
             if len(self.conversation_history) > 8:
                 self.conversation_history = self.conversation_history[-8:]
             return response
         except Exception as e:
+            return f"Xin lỗi, tôi gặp lỗi khi tạo phản hồi: {str(e)}"
     def _text_to_speech(self, text: str) -> Optional[str]:
+        """Chuyển văn bản thành giọng nói"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
             if tts_bytes:
                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
+                print(f"✅ Đã tạo TTS: {audio_path}")
                 return audio_path
         except Exception as e:
             print(f"❌ Lỗi TTS: {e}")
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'last_update': time.strftime("%H:%M:%S")