Update src/worker.js

src/worker.js  CHANGED  (+177 -102)
@@ -26,11 +26,24 @@ import {
   MIN_SPEECH_DURATION_SAMPLES,
 } from "./constants";
 
+// WebGPU availability check - fail fast
+if (!navigator.gpu) {
+  self.postMessage({
+    type: "error",
+    error: new Error("WebGPU not supported. This app requires Chrome 113+, Edge 113+, or Chrome Canary with WebGPU enabled.")
+  });
+  throw new Error("WebGPU not available");
+}
+
+// TTS Configuration
 const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
 let voice;
 const tts = await KokoroTTS.from_pretrained(model_id, {
-  dtype: "
+  dtype: "fp16", // Keep fp16 for memory efficiency
   device: "webgpu",
+}).catch((error) => {
+  self.postMessage({ error: new Error(`TTS loading failed: ${error.message}`) });
+  throw error;
 });
 
 const device = "webgpu";
@@ -41,18 +54,19 @@ self.postMessage({
   duration: "until_next",
 });
 
-// Load
+// Load VAD model
 const silero_vad = await AutoModel.from_pretrained(
   "onnx-community/silero-vad",
   {
     config: { model_type: "custom" },
-    dtype: "fp32",
+    dtype: "fp32",
   },
 ).catch((error) => {
-  self.postMessage({ error });
+  self.postMessage({ error: new Error(`VAD loading failed: ${error.message}`) });
   throw error;
 });
 
+// Whisper configuration
 const DEVICE_DTYPE_CONFIGS = {
   webgpu: {
     encoder_model: "fp32",
@@ -63,37 +77,66 @@ const DEVICE_DTYPE_CONFIGS = {
     decoder_model_merged: "q8",
   },
 };
+
 const transcriber = await pipeline(
   "automatic-speech-recognition",
-  "onnx-community/whisper-base",
+  "onnx-community/whisper-base",
   {
     device,
     dtype: DEVICE_DTYPE_CONFIGS[device],
+    // Specify language to avoid warnings
+    language: "en",
+    task: "transcribe",
   },
 ).catch((error) => {
-  self.postMessage({ error });
+  self.postMessage({ error: new Error(`Whisper loading failed: ${error.message}`) });
   throw error;
 });
 
-
+// Warm up the transcriber
+await transcriber(new Float32Array(INPUT_SAMPLE_RATE));
 
-
-const
-const
+// LLM Configuration - Split tokenizer and model sources
+const TOKENIZER_MODEL_ID = "Qwen/Qwen3-1.7B"; // Original repo has tokenizer
+const ONNX_MODEL_ID = "onnx-community/Qwen3-1.7B-ONNX"; // ONNX weights
+
+// Load tokenizer from original repo
+const tokenizer = await AutoTokenizer.from_pretrained(TOKENIZER_MODEL_ID).catch((error) => {
+  self.postMessage({ error: new Error(`Tokenizer loading failed: ${error.message}`) });
+  throw error;
+});
+
+// Load ONNX model weights
+const llm = await AutoModelForCausalLM.from_pretrained(ONNX_MODEL_ID, {
   dtype: "q4f16",
   device: "webgpu",
+  // Add model-specific config for Qwen3
+  model_config: {
+    use_cache: true,
+    attention_bias: false,
+  }
+}).catch((error) => {
+  self.postMessage({ error: new Error(`LLM loading failed: ${error.message}`) });
+  throw error;
 });
 
+// System prompt optimized for conversational AI
 const SYSTEM_MESSAGE = {
   role: "system",
   content:
-    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
+    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual. Focus on being natural and engaging in conversation.",
 };
-await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
 
+// Warm up the LLM
+await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 });
+
+// Conversation state
 let messages = [SYSTEM_MESSAGE];
 let past_key_values_cache;
 let stopping_criteria;
+const MAX_CONTEXT_MESSAGES = 20; // Prevent unbounded memory growth
+
+// Send ready signal with available voices
 self.postMessage({
   type: "status",
   status: "ready",
@@ -101,17 +144,17 @@ self.postMessage({
   voices: tts.voices,
 });
 
-//
+// Audio processing state
 const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
 let bufferPointer = 0;
 
-//
+// VAD state
 const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
 let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
 
-//
+// Recording state
 let isRecording = false;
-let isPlaying = false;
+let isPlaying = false;
 
 /**
  * Perform Voice Activity Detection (VAD)
@@ -122,86 +165,126 @@ async function vad(buffer) {
   const input = new Tensor("float32", buffer, [1, buffer.length]);
 
   const { stateN, output } = await silero_vad({ input, sr, state });
-  state = stateN;
+  state = stateN;
 
   const isSpeech = output.data[0];
 
-  // Use heuristics to determine if the buffer is speech or not
   return (
-    // Case 1: We are above the threshold (definitely speech)
     isSpeech > SPEECH_THRESHOLD ||
-    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
     (isRecording && isSpeech >= EXIT_THRESHOLD)
   );
 }
 
 /**
- *
+ * Handle speech-to-speech pipeline
  * @param {Float32Array} buffer The audio buffer
- * @param {Object} data Additional data
+ * @param {Object} data Additional timing data
  */
 const speechToSpeech = async (buffer, data) => {
   isPlaying = true;
 
-
-
-
-
-
-
-
-
-  // Set up text-to-speech streaming
-  const splitter = new TextSplitterStream();
-  const stream = tts.stream(splitter, {
-    voice,
-  });
-  (async () => {
-    for await (const { text, phonemes, audio } of stream) {
-      self.postMessage({ type: "output", text, result: audio });
+  try {
+    // 1. Transcribe audio
+    const transcription = await transcriber(buffer);
+    const text = transcription.text?.trim() || "";
+
+    if (!text || text === "[BLANK_AUDIO]") {
+      isPlaying = false;
+      return;
     }
-  })();
-
-  // 2. Generate a response using the LLM
-  const inputs = tokenizer.apply_chat_template(messages, {
-    add_generation_prompt: true,
-    return_dict: true,
-  });
-  const streamer = new TextStreamer(tokenizer, {
-    skip_prompt: true,
-    skip_special_tokens: true,
-    callback_function: (text) => {
-      splitter.push(text);
-    },
-    token_callback_function: () => {},
-  });
-
-  stopping_criteria = new InterruptableStoppingCriteria();
-  const { past_key_values, sequences } = await llm.generate({
-    ...inputs,
-    past_key_values: past_key_values_cache,
-
-    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
-    max_new_tokens: 1024,
-    streamer,
-    stopping_criteria,
-    return_dict_in_generate: true,
-  });
-  past_key_values_cache = past_key_values;
 
-
-
+    // Add user message
+    messages.push({ role: "user", content: text });
 
-
-
-
-
+    // Manage context window
+    if (messages.length > MAX_CONTEXT_MESSAGES) {
+      messages = [SYSTEM_MESSAGE, ...messages.slice(-(MAX_CONTEXT_MESSAGES - 1))];
+      past_key_values_cache = null; // Reset cache when context changes
+    }
 
-
+    // Set up TTS streaming
+    const splitter = new TextSplitterStream();
+    const stream = tts.stream(splitter, { voice });
+
+    // Stream TTS output
+    (async () => {
+      try {
+        for await (const { text, phonemes, audio } of stream) {
+          self.postMessage({ type: "output", text, result: audio });
+        }
+      } catch (error) {
+        console.error("TTS streaming error:", error);
+      }
+    })();
+
+    // 2. Generate LLM response
+    const inputs = tokenizer.apply_chat_template(messages, {
+      add_generation_prompt: true,
+      return_dict: true,
+      // Qwen3 specific - disable thinking mode for conversational use
+      enable_thinking: false,
+    });
+
+    const streamer = new TextStreamer(tokenizer, {
+      skip_prompt: true,
+      skip_special_tokens: true,
+      callback_function: (text) => {
+        splitter.push(text);
+      },
+      token_callback_function: () => {},
+    });
+
+    stopping_criteria = new InterruptableStoppingCriteria();
+
+    // Generate with appropriate settings for Qwen3
+    const { past_key_values, sequences } = await llm.generate({
+      ...inputs,
+      past_key_values: past_key_values_cache,
+
+      // Qwen3 optimal settings for non-thinking mode
+      do_sample: true,
+      temperature: 0.7,
+      top_p: 0.8,
+      top_k: 20,
+      max_new_tokens: 512, // Keep responses concise for voice
+
+      streamer,
+      stopping_criteria,
+      return_dict_in_generate: true,
+
+      // Ensure proper EOS handling for Qwen3
+      eos_token_id: [151643, 151645],
+      pad_token_id: tokenizer.pad_token_id,
+    });
+
+    past_key_values_cache = past_key_values;
+
+    // Close the TTS stream
+    splitter.close();
+
+    // Decode and store assistant response
+    const decoded = tokenizer.batch_decode(
+      sequences.slice(null, [inputs.input_ids.dims[1], null]),
+      { skip_special_tokens: true },
+    );
+
+    messages.push({ role: "assistant", content: decoded[0] });
+
+  } catch (error) {
+    console.error("Speech-to-speech error:", error);
+    self.postMessage({
+      type: "error",
+      error: new Error(`Processing failed: ${error.message}`)
+    });
+  } finally {
+    isPlaying = false;
+  }
 };
 
-//
+// Audio buffer management
 let postSpeechSamples = 0;
+let prevBuffers = [];
+
 const resetAfterRecording = (offset = 0) => {
   self.postMessage({
     type: "status",
@@ -216,39 +299,39 @@ const resetAfterRecording = (offset = 0) => {
 };
 
 const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
-  // Get start and end time of the speech segment, minus the padding
   const now = Date.now();
-  const end =
-    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+  const end = now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
   const duration = end - start;
   const overflowLength = overflow?.length ?? 0;
 
-  //
+  // Prepare padded buffer
   const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
-
   const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
   const paddedBuffer = new Float32Array(prevLength + buffer.length);
+
   let offset = 0;
   for (const prev of prevBuffers) {
     paddedBuffer.set(prev, offset);
     offset += prev.length;
   }
   paddedBuffer.set(buffer, offset);
+
+  // Process speech
   speechToSpeech(paddedBuffer, { start, end, duration });
 
-  //
+  // Handle overflow
   if (overflow) {
     BUFFER.set(overflow, 0);
   }
   resetAfterRecording(overflowLength);
 };
 
-
+// Message handler
 self.onmessage = async (event) => {
   const { type, buffer } = event.data;
 
-  //
+  // Block audio during playback
   if (type === "audio" && isPlaying) return;
 
   switch (type) {
@@ -260,6 +343,7 @@ self.onmessage = async (event) => {
     case "end_call":
       messages = [SYSTEM_MESSAGE];
       past_key_values_cache = null;
+      // Fall through to interrupt
     case "interrupt":
       stopping_criteria?.interrupt();
       return;
@@ -271,15 +355,13 @@ self.onmessage = async (event) => {
       return;
   }
 
-
+  // Process audio buffer
+  const wasRecording = isRecording;
   const isSpeech = await vad(buffer);
 
   if (!wasRecording && !isSpeech) {
-    //
-    // so we will probably discard the buffer. So, we insert
-    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
+    // Queue non-speech buffers for padding
     if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
-      // If the queue is full, we discard the oldest buffer
       prevBuffers.shift();
     }
     prevBuffers.push(buffer);
@@ -288,25 +370,21 @@ self.onmessage = async (event) => {
 
   const remaining = BUFFER.length - bufferPointer;
   if (buffer.length >= remaining) {
-    //
-    // so we perform transcription and copy the overflow to the global buffer
+    // Buffer overflow - trigger transcription
    BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
     bufferPointer += remaining;
 
-    // Dispatch the audio buffer
     const overflow = buffer.subarray(remaining);
     dispatchForTranscriptionAndResetAudioBuffer(overflow);
     return;
   } else {
-    //
-    // so we copy it to the global buffer
+    // Add to buffer
     BUFFER.set(buffer, bufferPointer);
     bufferPointer += buffer.length;
   }
 
   if (isSpeech) {
     if (!isRecording) {
-      // Indicate start of recording
       self.postMessage({
         type: "status",
         status: "recording_start",
@@ -314,25 +392,19 @@ self.onmessage = async (event) => {
         duration: "until_next",
       });
     }
-    // Start or continue recording
     isRecording = true;
-    postSpeechSamples = 0;
+    postSpeechSamples = 0;
     return;
   }
 
   postSpeechSamples += buffer.length;
 
-  //
-  // So, we check whether we have reached the end of the current audio chunk.
+  // Check for end of speech
   if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
-    // There was a short pause, but not long enough to consider the end of a speech chunk
-    // (e.g., the speaker took a breath), so we continue recording
     return;
   }
 
   if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
-    // The entire buffer (including the new chunk) is smaller than the minimum
-    // duration of a speech chunk, so we can safely discard the buffer.
     resetAfterRecording();
     return;
   }
@@ -340,16 +412,19 @@ self.onmessage = async (event) => {
   dispatchForTranscriptionAndResetAudioBuffer();
 };
 
+// Greeting function
 function greet(text) {
   isPlaying = true;
   const splitter = new TextSplitterStream();
   const stream = tts.stream(splitter, { voice });
+
   (async () => {
     for await (const { text: chunkText, audio } of stream) {
      self.postMessage({ type: "output", text: chunkText, result: audio });
     }
   })();
+
   splitter.push(text);
   splitter.close();
   messages.push({ role: "assistant", content: text });
-}
+}
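
The worker pulls all of its audio tuning values from ./constants, which is not touched by this commit. For orientation only, here is a rough sketch of the shape that module needs to have: the export names come from the imports and references in worker.js above, but every value is an illustrative guess (16 kHz is the rate Silero VAD and Whisper typically expect), not the Space's actual configuration.

// constants.js - hypothetical sketch; names mirror the worker.js imports, values are illustrative only.
export const INPUT_SAMPLE_RATE = 16000; // Hz expected by Silero VAD and Whisper

// VAD probability thresholds used in vad()
export const SPEECH_THRESHOLD = 0.3; // enter the "speech" state
export const EXIT_THRESHOLD = 0.1;   // stay in the "speech" state while recording

// Durations expressed in samples at INPUT_SAMPLE_RATE
export const MIN_SPEECH_DURATION_SAMPLES = 0.25 * INPUT_SAMPLE_RATE;  // discard shorter segments
export const MIN_SILENCE_DURATION_SAMPLES = 0.4 * INPUT_SAMPLE_RATE;  // silence that ends a segment
export const SPEECH_PAD_SAMPLES = 0.08 * INPUT_SAMPLE_RATE;           // padding kept around a segment

// Rolling-buffer limits
export const MAX_BUFFER_DURATION = 30;  // seconds of audio held in BUFFER
export const MAX_NUM_PREV_BUFFERS = 4;  // FIFO of pre-speech chunks used as left padding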
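
For context on the message protocol, a minimal sketch of the host-page side, based only on the message types visible in this diff (inbound "audio", "interrupt", "end_call"; outbound "status", "output", "error", plus bare { error } on load failures). The sendAudioChunk and playAudio helpers are hypothetical placeholders, not code from the Space.

// main.js - hypothetical host-page sketch, not part of this commit.
const worker = new Worker(new URL("./worker.js", import.meta.url), { type: "module" });

// Placeholder playback hook; a real app would queue data.result into the Web Audio API.
function playAudio(audio) {
  console.log("received audio chunk", audio);
}

worker.onmessage = ({ data }) => {
  switch (data.type) {
    case "status":
      // e.g. "ready" arrives with data.voices; "recording_start" marks detected speech
      console.log("worker status:", data.status);
      break;
    case "output":
      // data.text is a text chunk, data.result the TTS audio generated for it
      playAudio(data.result);
      break;
    case "error":
      console.error(data.error);
      break;
    default:
      if (data.error) console.error(data.error); // model-loading failures post a bare { error }
  }
};

// Forward microphone chunks (Float32Array at the expected sample rate) for VAD + transcription.
function sendAudioChunk(chunk) {
  worker.postMessage({ type: "audio", buffer: chunk });
}

// Controls wired to the cases handled in worker.js.
const interrupt = () => worker.postMessage({ type: "interrupt" });
const endCall = () => worker.postMessage({ type: "end_call" });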