Banafo's picture
Adapt to using Kroko SDK and the new models
f06e713
// This file copies and modifies code
// from https://mdn.github.io/web-dictaphone/scripts/app.js
// and https://gist.github.com/meziantou/edb7217fddfbb70e899e
import AwaitLock from './await-lock.js';
import { KrokoWorker } from './kroko-sdk.js';
// DOM handles used by the model picker and by the recording / transcription UI.
// Dropdown that receives one <option> per language.
const languageSelect = document.getElementById('languageSelect');
// Checkbox-like toggle: checked selects 'pro' models, unchecked selects 'free'.
const modelTypeToggle = document.getElementById('modelType');
// Container that receives one card per model.
const modelList = document.getElementById('modelList');
// Note: this is the element with id 'recordBtn'.
const startBtn = document.getElementById('recordBtn');
// Note: despite its name this is the element with id 'fileInput'; the file
// "change" listener is attached to it and recorded clips are appended to it.
const soundClips = document.getElementById('fileInput');
/**
 * Reads the currently selected language and model tier from the picker
 * controls and re-filters the model cards accordingly.
 */
function applyFilters() {
  const selectedOption = languageSelect.options[languageSelect.selectedIndex];
  const tier = modelTypeToggle.checked ? 'pro' : 'free';
  filterModels(selectedOption.value, tier);
}
/**
 * Shows only the model cards whose data-lang and data-type match.
 *
 * @param {string} lang - Language code compared against each card's dataset.lang.
 * @param {string} type - Model tier compared against each card's dataset.type.
 */
function filterModels(lang, type) {
  for (const card of modelList.children) {
    const isMatch = card.dataset.lang === lang && card.dataset.type === type;
    card.classList.toggle('hidden', !isMatch);
  }
}
// Transcript state for the microphone path.
// Set to true when the microphone source is connected to the recorder node and
// back to false when it is disconnected (or when a new model is loaded).
let started = false;
// Element with id 'results' that displays the transcript.
let textArea = document.getElementById('results');
// Most recent non-empty recognizer text that has not been moved to resultList yet.
let lastResult = '';
// Texts that were moved out of lastResult when the recognizer reported an endpoint.
let resultList = [];
/**
 * Renders the microphone transcript as numbered lines.
 *
 * Every non-empty entry of `resultList` becomes "<n>: <text>\n", followed by
 * the in-progress `lastResult` (if any). Numbering starts at 0 and skips
 * empty entries.
 *
 * @returns {string} The text to show in the results area.
 */
function getDisplayResult() {
  let lineNumber = 0;
  let ans = '';
  // for...of iterates the array values; for...in would walk string keys and
  // any enumerable properties added to the array.
  for (const text of resultList) {
    if (text === '') {
      continue;
    }
    ans += `${lineNumber}: ${text}\n`;
    lineNumber += 1;
  }
  if (lastResult.length > 0) {
    ans += `${lineNumber}: ${lastResult}\n`;
  }
  return ans;
}
// Shared audio / recognizer state.
// AudioContext, created lazily once microphone access succeeds.
let audioCtx;
// Source node created from the microphone stream.
let mediaStream;
// Sample rate handed to the recognizer and written into exported WAV files.
let expectedSampleRate = 16000;
let recordSampleRate; // the sampleRate of the microphone
let recorder = null; // the microphone
let leftchannel = []; // TODO: Use a single channel
let recordingLength = 0; // number of samples so far
// Worker wrapper from the Kroko SDK; used to create the online recognizer.
let krokoWorker = new KrokoWorker();
// Recognizer created by loadModel(); null until a model has been loaded.
let recognizer = null;
// Recognizer stream for the microphone path; created on the first audio callback.
let recognizer_stream = null;
// Acquired at the start of each microphone audio callback and released at its end.
let streamLock = new AwaitLock;
/**
 * Fetches the public language list from the Kroko license service.
 *
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} With the trimmed response text when the status is not OK.
 */
async function getLanguages() {
  const response = await fetch('https://license.kroko.ai/api/public/v1/languages');
  if (response.ok) {
    return response.json();
  }
  const body = await response.text();
  throw new Error(body.trim());
}
/**
 * Fetches the public model catalogue from the Kroko license service.
 *
 * @returns {Promise<any>} The parsed JSON body.
 * @throws {Error} With the trimmed response text when the status is not OK.
 */
async function getModels() {
  const catalogueResponse = await fetch('https://license.kroko.ai/api/public/v1/models');
  if (!catalogueResponse.ok) {
    const reason = (await catalogueResponse.text()).trim();
    throw new Error(reason);
  }
  return catalogueResponse.json();
}
/**
 * Asks the license service for the download URL of a licensed model.
 *
 * @param {string} modelId - Sent as the `model_id` query parameter.
 * @param {string} license - Sent as the `license` query parameter.
 * @returns {Promise<string>} The `url` field of the first returned entry.
 * @throws {Error} With the trimmed response text when the status is not OK,
 *   or 'Invalid license key' when the response has no first entry.
 */
async function getModelUrl(modelId, license) {
  const endpoint = new URL('https://license.kroko.ai/api/public/v1/models');
  endpoint.searchParams.set('model_id', modelId);
  endpoint.searchParams.set('license', license);

  const response = await fetch(endpoint.toString());
  if (!response.ok) {
    const reason = await response.text();
    throw new Error(reason.trim());
  }

  const entries = await response.json();
  const firstEntry = entries[0];
  if (firstEntry === undefined) {
    throw new Error('Invalid license key');
  }
  return firstEntry.url;
}
// Catalogue data, fetched at module load and ordered by display name.
const languages = (await getLanguages()).sort(
  (left, right) => left.name.localeCompare(right.name),
);
// Only models flagged as streaming are offered by this page.
const models = (await getModels())
  .filter((candidate) => candidate.streaming)
  .sort((left, right) => left.name.localeCompare(right.name));
// #region languages
// Add one <option> per language; its value is the language's `iso` code.
for (const language of languages) {
  const option = document.createElement('option');
  option.value = language.iso;
  option.textContent = language.name;
  languageSelect.appendChild(option);
}
languageSelect.addEventListener('change', applyFilters);
// #endregion languages
// #region model type
// Switching the model type re-runs the same filter as changing the language.
modelTypeToggle.addEventListener('change', applyFilters);
// #endregion model type
// #region models
// Build one card per streaming model. Cards start hidden; filterModels()
// reveals those whose data-lang / data-type match the current selection.
models.forEach(model => {
  const modelDiv = document.createElement('div');
  modelDiv.style.display = 'flex';
  modelDiv.style.flexDirection = 'column';
  modelDiv.style.gap = '0.5rem';
  modelDiv.classList.add('hidden');
  modelDiv.dataset.lang = model.language_iso;
  modelDiv.dataset.type = model.type;

  const nameDiv = document.createElement('div');
  nameDiv.style.fontSize = 'larger';
  nameDiv.style.fontWeight = 'bold';
  nameDiv.textContent = model.name;
  modelDiv.appendChild(nameDiv);

  const streamingDiv = document.createElement('div');
  streamingDiv.textContent = 'Streaming';
  modelDiv.appendChild(streamingDiv);

  const infoDiv = document.createElement('div');
  const sizeMB = Math.round(model.file_size / 1000 / 1000);
  const typeLabel = model.type === 'pro' ? 'Pro' : 'Community';
  // Only a number and a fixed label are interpolated, so innerHTML is safe here.
  infoDiv.innerHTML = `<strong>Size:</strong> ${sizeMB}MB &nbsp; <strong>Type:</strong> ${typeLabel}`;
  modelDiv.appendChild(infoDiv);

  if (model.type === 'pro') {
    const licenseDiv = document.createElement('div');
    licenseDiv.style.color = 'darkred';
    licenseDiv.textContent = 'Requires license key';
    modelDiv.appendChild(licenseDiv);
  }

  const buttonDiv = document.createElement('div');
  const button = document.createElement('button');
  button.style.padding = '0.5rem 1rem';
  button.style.border = '1px solid #e9ecef';
  button.style.borderRadius = '4px';
  button.style.backgroundColor = '#fff';
  button.style.color = 'goldenrod';
  button.style.cursor = 'pointer';
  button.style.fontSize = '1rem';
  button.classList.add('selectModel');
  button.innerHTML = '&#x2713; Select';

  // Selecting a model: optionally ask for a license, download and load the
  // model, then swap the picker for the "selected model" view.
  button.onclick = async () => {
    const status = document.getElementById('status');
    try {
      let url = model.url;
      let license = undefined;
      if (model.type !== 'free') {
        // Assign to the outer `license`. Declaring a new `const license` here
        // would shadow it, and loadModel() below would always get undefined.
        license = prompt('Add license key');
        if (license === null) return; // prompt was cancelled
        url = await getModelUrl(model.model_id, license);
      }

      document.getElementById('selectLanguageContent').classList.add('hidden');
      status.classList.remove('hidden');
      status.innerHTML = 'Downloading and loading model<b-dot>.</b-dot><b-dot>.</b-dot><b-dot>.</b-dot>';
      await loadModel(url, license);

      // Show a copy of the card with "Select another" instead of "Select".
      const modelInfoDiv = modelDiv.cloneNode(true);
      const buttonRefreshDiv = document.createElement('div');
      const buttonRefresh = document.createElement('button');
      buttonRefresh.style.padding = '0.5rem 1rem';
      buttonRefresh.style.border = '1px solid #e9ecef';
      buttonRefresh.style.borderRadius = '4px';
      buttonRefresh.style.backgroundColor = '#fff';
      buttonRefresh.style.color = '#5cb85c';
      buttonRefresh.style.cursor = 'pointer';
      buttonRefresh.style.fontSize = '1rem';
      buttonRefresh.innerHTML = 'Select another';
      buttonRefresh.onclick = () => {
        location.reload();
      };
      modelInfoDiv.querySelector('.selectModel').remove();
      buttonRefreshDiv.appendChild(buttonRefresh);
      modelInfoDiv.appendChild(buttonRefreshDiv);

      document.getElementById('selectedModel').prepend(modelInfoDiv);
      document.getElementById('selectedModelContent').classList.remove('hidden');
      document.getElementById('singleAudioContent').classList.remove('hidden');
      status.classList.add('hidden');
    } catch (ex) {
      if (ex.message === 'Invalid license key') {
        alert(ex.message);
      } else {
        // The failure may have happened before the status line was revealed
        // (e.g. while resolving the model URL), so make sure it is visible.
        status.classList.remove('hidden');
        status.innerText = ex.message;
      }
    }
  };

  buttonDiv.appendChild(button);
  modelDiv.appendChild(buttonDiv);
  modelList.appendChild(modelDiv);
});
// #endregion models
// The catalogue has been fetched and the picker is built: hide the status
// element and reveal the language / model selection section.
document.getElementById('status').classList.add('hidden');
document.getElementById('selectLanguageContent').classList.remove('hidden');
// #region new to old model format transformation
/**
 * Downloads a packed model file, splits it into its four parts and stores each
 * part in the Cache API under a synthetic URL.
 *
 * File layout as parsed below: a little-endian uint32 header length, a JSON
 * header, then the payload. The payload is four length-prefixed blocks
 * (encoder, decoder, joiner, tokens), either in the clear or, when `key` is
 * given, AES-CBC encrypted with a 16-byte IV in front.
 *
 * @param {string} url - Location of the packed model file.
 * @param {string} [key] - Password for encrypted payloads; when falsy the
 *   payload is parsed as-is.
 * @returns {Promise<string[]>} Synthetic URLs `[encoder, decoder, joiner, tokens]`,
 *   each of the form `${url}/<part>`.
 * @throws {Error} When the download fails, or when the header or payload cannot
 *   be read. Failing here keeps empty parts from being written to the cache,
 *   where they would never be overwritten.
 */
async function loadUnpackAndCacheModel(url, key) {
  const CACHE_NAME = 'kroko-sdk';

  // Returns the cached response for `url`, downloading and caching it first if needed.
  async function fetchWithCache(url) {
    const cache = await caches.open(CACHE_NAME);
    let response = await cache.match?.(url);
    if (response === undefined) {
      response = await fetch(url);
      if (!response.ok) {
        throw new Error('Failed to fetch the file');
      }
      await cache.put(url, response.clone());
    }
    return response;
  }

  // Splits `bytes` into the four length-prefixed model blocks; throws on truncation.
  function readModelBlocks(bytes) {
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    let offset = 0;
    const readBlock = () => {
      if (offset + 4 > bytes.length) throw new Error("Invalid block header");
      const len = view.getUint32(offset, true);
      offset += 4;
      if (offset + len > bytes.length) throw new Error("Block size mismatch");
      const block = bytes.slice(offset, offset + len);
      offset += len;
      return block;
    };
    // The on-disk order is fixed: encoder, decoder, joiner, tokens.
    const encoder = readBlock();
    const decoder = readBlock();
    const joiner = readBlock();
    const tokens = readBlock();
    return { encoder, decoder, joiner, tokens };
  }

  class ModelData {
    constructor() {
      this.filePath = "";
      this.header = {};
      this.blob = new Uint8Array(0);
      this.encoder = null;
      this.decoder = null;
      this.joiner = null;
      this.tokens = null;
    }

    // Downloads the file and separates the JSON header from the payload.
    // Returns false when the file is too short or the header is not valid JSON.
    async loadHeader(url) {
      this.filePath = url;
      const res = await fetchWithCache(url);
      if (!res.ok) return false;
      const data = new Uint8Array(await res.arrayBuffer());
      if (data.byteLength < 4) return false;
      const headerLen = new DataView(data.buffer).getUint32(0, true);
      if (data.byteLength < 4 + headerLen) return false;
      try {
        const headerText = new TextDecoder().decode(data.slice(4, 4 + headerLen));
        this.header = JSON.parse(headerText);
      } catch (_) {
        return false;
      }
      this.blob = data.slice(4 + headerLen);
      return true;
    }

    // Decrypts the payload (IV = first 16 bytes) and unpacks the four blocks.
    // Returns false on any key, decryption or parsing failure.
    async decryptPayload(password) {
      if (!this.blob || this.blob.byteLength < 16) return false;
      const iv = this.blob.slice(0, 16);
      const ciphertext = this.blob.slice(16);
      // The 32-byte key is the password's char codes, padded with ASCII '0'.
      const keyBytes = new Uint8Array(32).fill("0".charCodeAt(0));
      for (let i = 0; i < password.length && i < 32; i++) {
        keyBytes[i] = password.charCodeAt(i);
      }
      const subtle = globalThis.crypto.subtle;
      let cryptoKey;
      try {
        cryptoKey = await subtle.importKey("raw", keyBytes, { name: "AES-CBC" }, false, ["decrypt"]);
      } catch (e) {
        console.error("Key import error:", e);
        return false;
      }
      let plaintext;
      try {
        plaintext = await subtle.decrypt({ name: "AES-CBC", iv }, cryptoKey, ciphertext);
      } catch (e) {
        console.error("Decryption error:", e);
        return false;
      }
      return this.unpack(new Uint8Array(plaintext));
    }

    // Unpacks the four blocks from an unencrypted payload.
    loadPayload() {
      if (!this.blob || this.blob.byteLength < 4) return false;
      return this.unpack(this.blob);
    }

    // Shared tail of decryptPayload/loadPayload: all four parts are assigned
    // together, or none of them when the payload is malformed.
    unpack(bytes) {
      try {
        Object.assign(this, readModelBlocks(bytes));
      } catch (e) {
        console.error("Payload parsing error:", e);
        return false;
      }
      return true;
    }

    // Returns a header field as a string ("" when absent, JSON for non-strings).
    getHeaderValue(key) {
      const val = this.header[key];
      if (val === undefined) return "";
      if (typeof val === "string") return val;
      return JSON.stringify(val);
    }
  }

  // Stores `contents` under `${url}/${name}` unless an entry already exists there.
  async function createDummyCacheEntry(name, contents) {
    const dummyUrl = url + '/' + name;
    const cache = await caches.open(CACHE_NAME);
    let response = await cache.match?.(dummyUrl);
    if (response === undefined) {
      response = new Response(contents, {status: 200, statusText: 'OK'});
      await cache.put(dummyUrl, response.clone());
    }
    return dummyUrl;
  }

  const modelData = new ModelData();
  if (!(await modelData.loadHeader(url))) {
    throw new Error('Failed to read the model file header');
  }
  const unpacked = key ? await modelData.decryptPayload(key) : modelData.loadPayload();
  if (!unpacked) {
    throw new Error(key
      ? 'Failed to decrypt the model; check the license key'
      : 'Failed to unpack the model payload');
  }

  return [
    await createDummyCacheEntry('encoder', modelData.encoder),
    await createDummyCacheEntry('decoder', modelData.decoder),
    await createDummyCacheEntry('joiner', modelData.joiner),
    await createDummyCacheEntry('tokens', modelData.tokens),
  ];
}
// #endregion new to old model format transformation
/**
 * Downloads and unpacks a model, then creates the shared online recognizer.
 *
 * @param {string} url - Location of the packed model file.
 * @param {string} [license] - License key for licensed models. It is forwarded
 *   as the unpacking key; when omitted the payload is read unencrypted.
 * @returns {Promise<void>} Resolves once `recognizer` has been assigned.
 */
async function loadModel(url, license) {
  // Forward the license: without it the decrypting branch of
  // loadUnpackAndCacheModel() can never run.
  const [encoder, decoder, joiner, tokens] = await loadUnpackAndCacheModel(url, license);
  recognizer = await krokoWorker.createOnlineRecognizer({
    modelConfig: {
      transducer: {
        encoder,
        decoder,
        joiner,
      },
      tokens,
    },
  });
  started = false;
  console.log('recognizer is created!', recognizer);
}
// Microphone capture: request the microphone, feed its audio to the recognizer
// through a ScriptProcessor node, and let the start button toggle capture.
// `mediaDevices` is undefined in insecure contexts, hence the optional chaining.
if (navigator.mediaDevices?.getUserMedia) {
  console.log('getUserMedia supported.');
  // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
  const constraints = {audio: true};

  const onSuccess = function(stream) {
    if (!audioCtx) {
      audioCtx = new AudioContext({sampleRate: 16000});
    }
    console.log(audioCtx);
    recordSampleRate = audioCtx.sampleRate;
    console.log('sample rate ' + recordSampleRate);

    // creates an audio node from the microphone incoming stream
    mediaStream = audioCtx.createMediaStreamSource(stream);
    console.log('media stream', mediaStream);

    // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
    // bufferSize: the onaudioprocess event is called when the buffer is full
    const bufferSize = 4096;
    const numberOfInputChannels = 1;
    const numberOfOutputChannels = 2;
    if (audioCtx.createScriptProcessor) {
      recorder = audioCtx.createScriptProcessor(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    } else {
      recorder = audioCtx.createJavaScriptNode(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    }
    console.log('recorder', recorder);

    recorder.onaudioprocess = async function(e) {
      // Copy the input synchronously: the event's buffers are only meaningful
      // while the handler is running, i.e. before the first await.
      let samples = new Float32Array(e.inputBuffer.getChannelData(0));

      await streamLock.acquireAsync();
      try {
        samples = downsampleBuffer(samples, expectedSampleRate);
        if (recognizer_stream == null) {
          recognizer_stream = await recognizer.createStream();
        }
        await recognizer_stream.acceptWaveform(expectedSampleRate, samples);
        while (await recognizer.isReady(recognizer_stream)) {
          await recognizer.decode(recognizer_stream);
        }

        const isEndpoint = await recognizer.isEndpoint(recognizer_stream);
        const result = (await recognizer.getResult(recognizer_stream)).text;
        if (result.length > 0 && lastResult != result) {
          lastResult = result;
        }
        if (isEndpoint) {
          // Promote the finished utterance and start a new one.
          if (lastResult.length > 0) {
            resultList.push(lastResult);
            lastResult = '';
          }
          await recognizer.reset(recognizer_stream);
        }

        textArea.value = getDisplayResult();
        textArea.scrollTop = textArea.scrollHeight; // auto scroll

        // Keep a 16-bit copy of the audio so it can be exported as WAV on stop.
        const buf = new Int16Array(samples.length);
        for (let i = 0; i < samples.length; ++i) {
          let s = samples[i];
          if (s >= 1) {
            s = 1;
          } else if (s <= -1) {
            s = -1;
          }
          buf[i] = s * 32767;
        }
        leftchannel.push(buf);
        // Count what was stored (post-downsampling), not the raw buffer size.
        recordingLength += buf.length;
      } finally {
        // Always release, otherwise one recognizer error would block every
        // later audio callback on the lock.
        streamLock.release();
      }
    };

    startBtn.onclick = function() {
      if (started) {
        console.log('recorder stopped');
        recorder.disconnect(audioCtx.destination);
        mediaStream.disconnect(recorder);
        started = false;

        const clipName = new Date().toISOString();
        const clipContainer = document.createElement('article');
        const clipLabel = document.createElement('p');
        const audio = document.createElement('audio');
        const deleteButton = document.createElement('button');
        clipContainer.classList.add('clip');
        audio.setAttribute('controls', '');
        deleteButton.textContent = 'Delete';
        deleteButton.className = 'delete';
        clipLabel.textContent = clipName;
        clipContainer.appendChild(audio);
        clipContainer.appendChild(clipLabel);
        clipContainer.appendChild(deleteButton);
        soundClips.appendChild(clipContainer);
        audio.controls = true;

        const blob = toWav(flatten(leftchannel));
        leftchannel = [];
        const audioURL = window.URL.createObjectURL(blob);
        audio.src = audioURL;

        deleteButton.onclick = function() {
          clipContainer.remove();
          // The clip is gone, so its blob can be freed.
          window.URL.revokeObjectURL(audioURL);
        };
        clipLabel.onclick = function() {
          const newClipName = prompt('Enter a new name for your sound clip?');
          // A cancelled prompt keeps the current name.
          if (newClipName !== null) {
            clipLabel.textContent = newClipName;
          }
        };
      } else {
        mediaStream.connect(recorder);
        recorder.connect(audioCtx.destination);
        console.log('recorder started');
        started = true;
      }
    };
  };

  const onError = function(err) {
    console.log('The following error occured: ' + err);
  };

  navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
} else {
  console.log('getUserMedia not supported on your browser!');
  alert('getUserMedia not supported on your browser!');
}
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Concatenates a list of sample chunks into one Int16Array.
 *
 * @param {ArrayLike<number>[]} listOfSamples - Chunks, in playback order.
 * @returns {Int16Array} All chunks joined back to back.
 */
function flatten(listOfSamples) {
  const totalLength = listOfSamples.reduce((sum, chunk) => sum + chunk.length, 0);
  const merged = new Int16Array(totalLength);
  let cursor = 0;
  for (const chunk of listOfSamples) {
    merged.set(chunk, cursor);
    cursor += chunk.length;
  }
  return merged;
}
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Wraps 16-bit mono PCM samples in a 44-byte WAV (RIFF) header.
 *
 * The header declares one channel at `expectedSampleRate`.
 *
 * @param {Int16Array} samples - PCM samples.
 * @returns {Blob} A Blob of type 'audio/wav' holding the header and samples.
 */
function toWav(samples) {
  const bytesPerSample = 2;
  const dataSize = samples.length * bytesPerSample;
  const buf = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buf);

  // http://soundfile.sapp.org/doc/WaveFormat/
  // Chunk ids are written as little-endian integers, so they read reversed.
  view.setUint32(0, 0x46464952, true); // chunkID "RIFF"
  view.setUint32(4, 36 + dataSize, true); // chunkSize
  view.setUint32(8, 0x45564157, true); // format "WAVE"

  view.setUint32(12, 0x20746d66, true); // subchunk1ID "fmt "
  view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
  // audioFormat is a 16-bit field; a 32-bit write here would spill into numChannels.
  view.setUint16(20, 1, true); // audioFormat, 1 for PCM
  view.setUint16(22, 1, true); // numChannels: 1 channel
  view.setUint32(24, expectedSampleRate, true); // sampleRate
  view.setUint32(28, expectedSampleRate * bytesPerSample, true); // byteRate
  view.setUint16(32, bytesPerSample, true); // blockAlign
  view.setUint16(34, 16, true); // bitsPerSample

  view.setUint32(36, 0x61746164, true); // subchunk2ID "data"
  view.setUint32(40, dataSize, true); // subchunk2Size

  let offset = 44;
  for (let i = 0; i < samples.length; ++i) {
    view.setInt16(offset, samples[i], true);
    offset += bytesPerSample;
  }
  return new Blob([view], {type: 'audio/wav'});
}
// this function is copied from
// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46
/**
 * Resamples `buffer` from `recordSampleRate` to `exportSampleRate` by averaging
 * the input samples that fall into each output sample's window.
 *
 * @param {Float32Array} buffer - Samples captured at `recordSampleRate`.
 * @param {number} exportSampleRate - Target sample rate.
 * @returns {Float32Array} `buffer` itself when the rates are equal, otherwise a
 *   new array of `Math.round(buffer.length / ratio)` samples.
 */
function downsampleBuffer(buffer, exportSampleRate) {
  if (exportSampleRate === recordSampleRate) {
    return buffer;
  }
  const sampleRateRatio = recordSampleRate / exportSampleRate;
  const newLength = Math.round(buffer.length / sampleRateRatio);
  const result = new Float32Array(newLength);
  let offsetBuffer = 0;
  for (let offsetResult = 0; offsetResult < result.length; offsetResult++) {
    const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
    let accum = 0;
    let count = 0;
    for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
      accum += buffer[i];
      count++;
    }
    if (count > 0) {
      result[offsetResult] = accum / count;
    } else {
      // Empty window (the source rate is lower than the target rate): repeat
      // the most recent input sample instead of writing 0 / 0 = NaN.
      const holdIndex = Math.min(Math.max(offsetBuffer - 1, 0), buffer.length - 1);
      result[offsetResult] = buffer[holdIndex];
    }
    offsetBuffer = nextOffsetBuffer;
  }
  return result;
}
// Per-slot state for transcribe(): both arrays are indexed by its `index` argument.
// Audio elements returned by loadAudio().
const audios = []
// ScriptProcessor nodes that receive the audio of the matching element.
const recorders = []
/**
 * Downloads an audio resource and wraps it in an Audio element backed by an
 * object URL.
 *
 * @param {string} url - Resource to fetch (requested with CORS mode).
 * @returns {Promise<HTMLAudioElement|undefined>} The element, or undefined when
 *   the download fails; the failure is logged, not thrown.
 */
async function loadAudio(url) {
  try {
    const reply = await fetch(url, { mode: "cors" });
    if (!reply.ok) {
      throw new Error("Network response was not ok");
    }
    const payload = await reply.blob();
    return new Audio(URL.createObjectURL(payload));
  } catch (failure) {
    console.error("Error loading audio:", failure);
    return undefined;
  }
}
/**
 * Plays an audio resource and streams what is played into the recognizer,
 * writing the running transcript into a text area.
 *
 * @param {string} url - Audio resource to load (see loadAudio()).
 * @param {string} output - Id of the element that receives the transcript.
 * @param {number} [index] - Slot in `audios` / `recorders`. With an index, the
 *   audio element and processor node are created once and reused. Without one,
 *   every call loads `url` afresh and replaces what the previous slot-less
 *   call set up.
 * @returns {Promise<void>} Resolves once playback and recognition are wired up.
 * @throws {Error} When the audio could not be loaded.
 */
async function transcribe(url, output, index) {
  const textarea = document.getElementById(output);
  textarea.value = '';

  // Transcript state private to this call (shadows the microphone state).
  let lastResult = '';
  const resultList = [];
  function getDisplayResult() {
    let lineNumber = 0;
    let ans = '';
    for (const text of resultList) {
      if (text === '') {
        continue;
      }
      ans += `${lineNumber}: ${text}\n`;
      lineNumber += 1;
    }
    if (lastResult.length > 0) {
      ans += `${lineNumber}: ${lastResult}\n`;
    }
    return ans;
  }

  if (index === undefined) {
    // No slot requested: reusing the cached entry would replay the first file
    // ever selected, so tear the previous one down and start over.
    audios[index]?.pause();
    if (recorders[index]) {
      recorders[index].onaudioprocess = null;
      recorders[index].disconnect();
    }
    audios[index] = undefined;
    recorders[index] = undefined;
  }

  if (!audios[index]) {
    audios[index] = await loadAudio(url);
  }
  if (!audios[index]) {
    // loadAudio() logs and returns undefined on failure.
    throw new Error(`Failed to load audio from ${url}`);
  }

  if (!audioCtx) {
    // The microphone path normally creates the context; do it here when
    // microphone access was never granted.
    audioCtx = new AudioContext({sampleRate: 16000});
  }

  Promise.resolve(audios[index].play()).catch((err) => {
    console.error('Audio playback failed:', err);
  });

  console.log(audioCtx);
  const recordSampleRate = audioCtx.sampleRate;
  console.log('sample rate ' + recordSampleRate);

  // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
  // bufferSize: the onaudioprocess event is called when the buffer is full
  const bufferSize = 4096;
  const numberOfInputChannels = 1;
  const numberOfOutputChannels = 2;
  if (!recorders[index]) {
    const processor = audioCtx.createScriptProcessor(
        bufferSize, numberOfInputChannels, numberOfOutputChannels);
    const elementSource = audioCtx.createMediaElementSource(audios[index]);
    elementSource.connect(processor);
    processor.connect(audioCtx.destination);
    // Registered once per node, and bound to this node's own source rather
    // than to the module-level microphone source.
    processor.addEventListener("recordingStopped", () => {
      console.log("Decoding has stopped.");
      elementSource.disconnect(processor);
    });
    recorders[index] = processor;
  }

  let recognizer_stream = null;
  const streamLock = new AwaitLock;
  recorders[index].onaudioprocess = async function(e) {
    // Read the input and write the pass-through output synchronously: the
    // event's buffers are only meaningful before the first await.
    const samples = new Float32Array(e.inputBuffer.getChannelData(0));
    e.outputBuffer.copyToChannel(samples, 0);

    await streamLock.acquireAsync();
    try {
      if (recognizer_stream == null) {
        recognizer_stream = await recognizer.createStream();
      }
      // NOTE(review): samples arrive at audioCtx.sampleRate but are labelled
      // with expectedSampleRate; confirm the two always match.
      await recognizer_stream.acceptWaveform(expectedSampleRate, samples);
      while (await recognizer.isReady(recognizer_stream)) {
        await recognizer.decode(recognizer_stream);
      }

      const isEndpoint = await recognizer.isEndpoint(recognizer_stream);
      const result = (await recognizer.getResult(recognizer_stream)).text;
      if (result.length > 0 && lastResult != result) {
        lastResult = result;
      }
      if (isEndpoint) {
        if (lastResult.length > 0) {
          resultList.push(lastResult);
          lastResult = '';
        }
        await recognizer.reset(recognizer_stream);
      }

      textarea.value = getDisplayResult();
      textarea.scrollTop = textarea.scrollHeight; // auto scroll
    } finally {
      // Always release, otherwise one recognizer error would block every
      // later audio callback on the lock.
      streamLock.release();
    }
  };
}
// When a file is chosen, transcribe it into the 'results' text area.
soundClips.addEventListener("change", function (event) {
  const file = event.target.files?.[0];
  if (!file) {
    console.log("No file selected.");
    return;
  }
  console.log("Selected file:", file.name, file.type, file.size, "bytes");

  // An object URL is all transcribe() needs; reading the whole file into
  // memory first would only duplicate the size that was just logged.
  const url = URL.createObjectURL(file);
  transcribe(url, 'results')
    .catch((err) => {
      console.error("Transcription failed:", err);
    })
    .finally(() => {
      // transcribe() fetches the URL before it settles, so it can be freed now.
      URL.revokeObjectURL(url);
    });
});
// Load audio sources
// Microphone recording logic
// The record button toggles a MediaRecorder; when recording stops, the
// captured audio is offered for playback in the #audioPlayback element.
const recordBtn = document.getElementById("recordBtn");
const audioPlayback = document.getElementById("audioPlayback");
let mediaRecorder;
let audioChunks = [];
recordBtn.addEventListener("click", async () => {
  if (!mediaRecorder || mediaRecorder.state === "inactive") {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const activeRecorder = new MediaRecorder(stream);
      mediaRecorder = activeRecorder;
      activeRecorder.ondataavailable = (event) => {
        audioChunks.push(event.data);
      };
      activeRecorder.onstop = () => {
        // Label the blob with the container the recorder actually produced.
        const audioBlob = new Blob(audioChunks, { type: activeRecorder.mimeType || "audio/wav" });
        audioChunks = [];
        // Each recording requests its own stream, so release this one.
        stream.getTracks().forEach((track) => track.stop());
        const audioURL = URL.createObjectURL(audioBlob);
        audioPlayback.src = audioURL;
        audioPlayback.style.display = "block";
      };
      activeRecorder.start();
      recordBtn.textContent = "Stop Recording";
      recordBtn.style.color = "#5cb85c";
    } catch (err) {
      // Surface the failure instead of swallowing it silently.
      console.error("Error accessing microphone:", err);
    }
  } else if (mediaRecorder.state === "recording") {
    mediaRecorder.stop();
    recordBtn.textContent = "Use Microphone";
    recordBtn.style.color = "#d9534f";
  }
});