fix: all segments to check in lang detection
- requirements.txt +1 -1
- whisper_streaming_custom/backends.py +12 -7
requirements.txt
CHANGED

@@ -5,7 +5,7 @@ numpy>=1.21.0
 ffmpeg-python>=0.2.0
 torch>=2.0.0
 torchaudio>=2.0.0
-faster-whisper
+faster-whisper
 websockets>=10.0
 pydantic>=1.8.0
 python-dotenv>=0.19.0
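The unpinned faster-whisper entry is worth a note: the detect_language call patched below, with its language_detection_segments parameter, appears only in newer faster-whisper releases. A minimal, hedged guard for that assumption; the check and its error message are illustrative, not part of this commit:

# Hypothetical guard: fail fast if the installed faster-whisper predates
# WhisperModel.detect_language. (Assumption: presence of the method is a
# good-enough proxy for compatibility with this commit.)
from faster_whisper import WhisperModel

if not hasattr(WhisperModel, "detect_language"):
    raise RuntimeError(
        "faster-whisper is too old for WhisperModel.detect_language; "
        "upgrade with: pip install -U faster-whisper"
    )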
whisper_streaming_custom/backends.py
CHANGED

@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
         device = "cuda" if torch and torch.cuda.is_available() else "cpu"
         compute_type = "float16" if device == "cuda" else "float32"
 
+        logger.info(f"Loading whisper model {model_size_or_path} on {device} with compute type {compute_type}")
+
         model = WhisperModel(
             model_size_or_path,
             device=device,
@@ -152,7 +154,7 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio: np.ndarray, init_prompt: str = "") -> list:
         segments, info = self.model.transcribe(
             audio,
-            language=
+            language=None,
             initial_prompt=init_prompt,
             beam_size=5,
             word_timestamps=True,
@@ -181,6 +183,8 @@ class FasterWhisperASR(ASRBase):
         self.transcribe_kargs["task"] = "translate"
 
     def detect_language(self, audio_file_path):
+
+        from faster_whisper.audio import decode_audio
         """
         Detect the language of the audio using faster-whisper's language detection.
 
@@ -194,17 +198,18 @@ class FasterWhisperASR(ASRBase):
           - probabilities (dict): Dictionary of language probabilities
         """
         try:
-
-            audio, sr = sf.read(audio_file_path)
-
-            # Convert to format expected by Whisper (16-bit PCM)
-            audio = (audio * 32768).astype(np.int16)
+            audio = decode_audio(audio_file_path, sampling_rate=self.model.feature_extractor.sampling_rate)
 
+            # Calculate total number of segments (each segment is 30 seconds)
+            audio_duration = len(audio) / self.model.feature_extractor.sampling_rate
+            segments_num = max(1, int(audio_duration / 30))  # At least 1 segment
+            logger.info(f"Audio duration: {audio_duration:.2f}s, using {segments_num} segments for language detection")
+
             # Use faster-whisper's detect_language method
             language, language_probability, all_language_probs = self.model.detect_language(
                 audio=audio,
                 vad_filter=False,  # Disable VAD for language detection
-                language_detection_segments=
+                language_detection_segments=segments_num,  # Use all possible segments
                 language_detection_threshold=0.5  # Default threshold
             )
 
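Taken together, the backends.py hunks replace the soundfile read plus manual int16 conversion with faster-whisper's own decoder, and scale the number of language-detection windows with the clip length instead of checking only the first window. A minimal sketch of how the patched method reads in context; the class wiring, logger setup, and return/except handling are assumptions, and where the commit places the decode_audio import above the method docstring (which demotes the docstring to a dead string literal), the sketch orders them conventionally:

import logging

from faster_whisper import WhisperModel
from faster_whisper.audio import decode_audio

logger = logging.getLogger(__name__)


class FasterWhisperASR:  # stand-in for the real ASRBase subclass
    def __init__(self, model_size_or_path: str = "base"):
        self.model = WhisperModel(model_size_or_path)

    def detect_language(self, audio_file_path):
        """Detect the language of the audio using faster-whisper's language detection."""
        try:
            # Decode straight to the model's expected sampling rate (float32 PCM),
            # replacing the old sf.read + 16-bit conversion.
            sr = self.model.feature_extractor.sampling_rate
            audio = decode_audio(audio_file_path, sampling_rate=sr)

            # One 30-second detection window per 30 s of audio, at least one,
            # so the whole clip is checked rather than just the opening window.
            audio_duration = len(audio) / sr
            segments_num = max(1, int(audio_duration / 30))
            logger.info(f"Audio duration: {audio_duration:.2f}s, using {segments_num} segments for language detection")

            language, language_probability, all_language_probs = self.model.detect_language(
                audio=audio,
                vad_filter=False,  # VAD off for language detection
                language_detection_segments=segments_num,
                language_detection_threshold=0.5,  # library default
            )
            return language, language_probability, all_language_probs
        except Exception:
            logger.exception("Language detection failed")
            raise

For a 95-second file this yields segments_num = max(1, int(95 / 30)) = 3, so detection looks at three 30-second windows instead of only the first; note that int() truncates, so the trailing 5 seconds fall outside any extra window.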