| /* Mini Piper implementation in Javascript. */ | |
| import EspeakModule from "./espeakng.worker.js"; | |
// espeak-ng output mode: synchronous synthesis, no audio playback.
const AUDIO_OUTPUT_SYNCHRONOUS = 2;
// Let espeak-ng auto-detect the input text encoding.
const espeakCHARS_AUTO = 0;
// Clause terminator flags returned by espeak-ng's TextToPhonemes variant:
// bits 12-13 encode intonation, bits 18-19 encode clause/sentence type.
// NOTE(review): values mirror espeak-ng's internal clause flags — confirm
// against the espeak-ng headers if the WASM build is upgraded.
const CLAUSE_INTONATION_FULL_STOP = 0x00000000;
const CLAUSE_INTONATION_COMMA = 0x00001000;
const CLAUSE_INTONATION_QUESTION = 0x00002000;
const CLAUSE_INTONATION_EXCLAMATION = 0x00003000;
const CLAUSE_TYPE_CLAUSE = 0x00040000;
const CLAUSE_TYPE_SENTENCE = 0x00080000;
// Composite terminator values for each punctuation mark (low bits are the
// pause length, per espeak-ng convention).
const CLAUSE_PERIOD = 40 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COMMA = 20 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
const CLAUSE_QUESTION = 40 | CLAUSE_INTONATION_QUESTION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_EXCLAMATION =
  45 | CLAUSE_INTONATION_EXCLAMATION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COLON = 30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE;
const CLAUSE_SEMICOLON = 30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
// Special markers used in the Piper phoneme_id_map: beginning of sentence,
// end of sentence, and the pad token interleaved between phoneme ids.
const BOS = "^";
const EOS = "$";
const PAD = "_";
// Module-level state populated by setVoice():
let espeakInstance = null; // Emscripten espeak-ng module instance
let espeakInitialized = false; // NOTE(review): written nowhere in this file — possibly vestigial
let voiceModel = null; // ort.InferenceSession for the loaded voice
let voiceConfig = null; // parsed voice JSON configuration
/**
 * Load a Piper voice: fetch its JSON configuration, initialize espeak-ng
 * (when the voice uses espeak phonemes), and create the ONNX inference
 * session for the voice model.
 *
 * @param {string} voiceModelUrl - URL of the .onnx voice model.
 * @param {string} [voiceConfigUrl] - URL of the voice config JSON;
 *   defaults to `${voiceModelUrl}.json`.
 * @returns {Promise<void>}
 * @throws {Error} If the voice configuration cannot be fetched.
 */
async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
  voiceConfigUrl = voiceConfigUrl ?? `${voiceModelUrl}.json`;
  const response = await fetch(voiceConfigUrl);
  if (!response.ok) {
    // BUG FIX: the original template literal was missing the "$", so the
    // URL was never interpolated into the error message.
    throw new Error(`Error loading voice configuration: ${voiceConfigUrl}`);
  }
  voiceConfig = await response.json();
  if (voiceConfig.phoneme_type === "espeak") {
    // Lazily create and initialize the espeak-ng WASM module once; it is
    // reused across voice changes.
    if (!espeakInstance) {
      espeakInstance = await EspeakModule();
      espeakInstance._espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, 0, 0);
    }
  }
  voiceModel = await ort.InferenceSession.create(voiceModelUrl);
}
/**
 * Synthesize text to a WAV audio Blob using the currently loaded voice.
 *
 * @param {string} text - Text to synthesize.
 * @param {number} [speakerId] - Speaker id for multi-speaker voices.
 * @param {number} [noiseScale] - VITS noise scale override.
 * @param {number} [lengthScale] - Speaking-rate scale override.
 * @param {number} [noiseWScale] - VITS noise-w scale override.
 * @returns {Promise<Blob>} 16-bit mono PCM WAV blob.
 * @throws {Error} If no voice has been set via setVoice().
 */
async function textToWavAudio(
  text,
  speakerId = undefined,
  noiseScale = undefined,
  lengthScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  const sampleRate = voiceConfig.audio.sample_rate;
  // BUG FIX: textToFloat32Audio's signature is
  // (text, speakerId, lengthScale, noiseScale, noiseWScale) — the original
  // call passed noiseScale and lengthScale in swapped positions.
  const float32Audio = await textToFloat32Audio(
    text,
    speakerId,
    lengthScale,
    noiseScale,
    noiseWScale,
  );
  return float32ToWavBlob(float32Audio, sampleRate);
}
/**
 * Synthesize text to raw Float32 PCM samples with the loaded ONNX voice.
 *
 * @param {string} text - Text to synthesize.
 * @param {number} [speakerId] - Speaker id for multi-speaker voices;
 *   defaults to 0 when the voice has more than one speaker.
 * @param {number} [lengthScale] - Speaking-rate scale; falls back to the
 *   voice config, then 1.0.
 * @param {number} [noiseScale] - Noise scale; falls back to the voice
 *   config, then 0.667.
 * @param {number} [noiseWScale] - Noise-w scale; falls back to the voice
 *   config, then 0.8.
 * @returns {Promise<Float32Array>} Mono audio samples in [-1, 1].
 * @throws {Error} If no voice has been set via setVoice().
 */
async function textToFloat32Audio(
  text,
  speakerId = undefined,
  lengthScale = undefined,
  noiseScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  // Fall back to the voice's configured inference settings, then to
  // Piper's customary defaults.
  lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
  noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
  noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
  if (voiceConfig.num_speakers > 1) {
    speakerId = speakerId ?? 0; // default to the first speaker
  }
  const textPhonemes = textToPhonemes(text);
  const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
  // Build model inputs: int64 phoneme ids/lengths and float32 scales.
  const phonemeIdsTensor = new ort.Tensor(
    "int64",
    new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
    [1, phonemeIds.length],
  );
  const phonemeLengthsTensor = new ort.Tensor(
    "int64",
    BigInt64Array.from([BigInt(phonemeIds.length)]),
    [1],
  );
  const scalesTensor = new ort.Tensor(
    "float32",
    Float32Array.from([noiseScale, lengthScale, noiseWScale]),
    [3],
  );
  const feeds = {
    input: phonemeIdsTensor,
    input_lengths: phonemeLengthsTensor,
    scales: scalesTensor,
  };
  if (voiceConfig.num_speakers > 1) {
    // Multi-speaker models take a speaker-id tensor; give it an explicit
    // [1] shape rather than relying on shape inference.
    feeds.sid = new ort.Tensor(
      "int64",
      BigInt64Array.from([BigInt(speakerId)]),
      [1],
    );
  }
  const results = await voiceModel.run(feeds);
  // Use the public Tensor.data accessor instead of the internal `cpuData`
  // field, which is not part of onnxruntime-web's documented API.
  return results.output.data;
}
/**
 * Convert text into phonemes: returns an array of sentences, each an array
 * of single-code-point phoneme strings (after NFD normalization).
 * Uses espeak-ng (WASM) unless the voice's phoneme_type is "text".
 *
 * @param {string} text - Text to phonemize.
 * @returns {string[][]} Phonemes grouped by sentence.
 * @throws {Error} If no voice is set, or espeak-ng is required but not
 *   initialized.
 */
function textToPhonemes(text) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  if (voiceConfig.phoneme_type == "text") {
    // Text phonemes: the characters themselves are the phonemes; the whole
    // input is treated as a single sentence. NFD keeps combining marks as
    // separate code points so each can be looked up in the id map.
    return [Array.from(text.normalize("NFD"))];
  }
  if (!espeakInstance) {
    throw new Error("espeak-ng is not initialized");
  }
  const voice = voiceConfig.espeak.voice;
  // Set voice: copy the voice name into the WASM heap as a NUL-terminated
  // UTF-8 string, hand the pointer to espeak-ng, then free it.
  const voicePtr = espeakInstance._malloc(
    espeakInstance.lengthBytesUTF8(voice) + 1,
  );
  espeakInstance.stringToUTF8(
    voice,
    voicePtr,
    espeakInstance.lengthBytesUTF8(voice) + 1,
  );
  espeakInstance._espeak_SetVoiceByName(voicePtr);
  espeakInstance._free(voicePtr);
  // Prepare text: same heap-copy dance for the input text.
  const textPtr = espeakInstance._malloc(
    espeakInstance.lengthBytesUTF8(text) + 1,
  );
  espeakInstance.stringToUTF8(
    text,
    textPtr,
    espeakInstance.lengthBytesUTF8(text) + 1,
  );
  // espeak consumes a char** and advances it past each clause; allocate a
  // 4-byte slot for the pointer (wasm32 pointers are 32-bit).
  const textPtrPtr = espeakInstance._malloc(4);
  espeakInstance.setValue(textPtrPtr, textPtr, "*");
  // Out-parameter receiving the clause terminator flags (i32).
  const terminatorPtr = espeakInstance._malloc(4);
  // Phoneme lists for each sentence
  const textPhonemes = [];
  // Phoneme list for current sentence
  let sentencePhonemes = [];
  while (true) {
    // NOTE(review): _espeak_TextToPhonemesWithTerminator appears to be a
    // custom export of this espeak-ng build (upstream has
    // espeak_TextToPhonemes without the terminator out-param) — confirm
    // against the WASM build. 0x02 selects IPA phoneme output.
    const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
      textPtrPtr,
      espeakCHARS_AUTO,
      /* IPA */ 0x02,
      terminatorPtr,
    );
    const clausePhonemes = espeakInstance.UTF8ToString(phonemesPtr);
    sentencePhonemes.push(clausePhonemes);
    const terminator = espeakInstance.getValue(terminatorPtr, "i32");
    // Mask off everything above the intonation bits; what remains matches
    // one of the CLAUSE_* composite constants.
    const punctuation = terminator & 0x000fffff;
    // Add punctuation phonemes so the model sees sentence-internal pauses.
    if (punctuation === CLAUSE_PERIOD) {
      sentencePhonemes.push(".");
    } else if (punctuation === CLAUSE_QUESTION) {
      sentencePhonemes.push("?");
    } else if (punctuation === CLAUSE_EXCLAMATION) {
      sentencePhonemes.push("!");
    } else if (punctuation === CLAUSE_COMMA) {
      sentencePhonemes.push(", ");
    } else if (punctuation === CLAUSE_COLON) {
      sentencePhonemes.push(": ");
    } else if (punctuation === CLAUSE_SEMICOLON) {
      sentencePhonemes.push("; ");
    }
    if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
      // End of sentence: start accumulating a new one.
      textPhonemes.push(sentencePhonemes);
      sentencePhonemes = [];
    }
    // espeak wrote the address of the unprocessed remainder back into the
    // char** slot; 0 means the whole input has been consumed.
    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
    if (nextTextPtr === 0) {
      break; // All text processed
    }
    // Advance text pointer
    espeakInstance.setValue(textPtrPtr, nextTextPtr, "*");
  }
  // Clean up all heap allocations made above.
  espeakInstance._free(textPtr);
  espeakInstance._free(textPtrPtr);
  espeakInstance._free(terminatorPtr);
  // Add lingering phonemes (input that ended without a sentence terminator).
  if (sentencePhonemes.length > 0) {
    textPhonemes.push(sentencePhonemes);
    sentencePhonemes = [];
  }
  // Prepare phonemes for Piper: join each sentence's clause strings and
  // split into single code points (NFD so diacritics stay separate).
  for (let i = 0; i < textPhonemes.length; i++) {
    textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
  }
  return textPhonemes;
}
/**
 * Map phonemes to model ids, interleaving the pad token the way Piper
 * expects: each sentence becomes [BOS, PAD, id, PAD, id, PAD, ..., EOS].
 * Phonemes missing from the id map are silently skipped.
 *
 * @param {Object} idMap - phoneme -> id mapping from the voice config.
 * @param {string[][]} textPhonemes - Phonemes grouped by sentence.
 * @returns {Array} Flat list of phoneme ids across all sentences.
 */
function phonemesToIds(idMap, textPhonemes) {
  const ids = [];
  for (const sentence of textPhonemes) {
    ids.push(idMap[BOS], idMap[PAD]);
    for (const phoneme of sentence) {
      if (phoneme in idMap) {
        ids.push(idMap[phoneme], idMap[PAD]);
      }
    }
    ids.push(idMap[EOS]);
  }
  return ids;
}
/**
 * Encode Float32 PCM samples as a 16-bit mono RIFF/WAVE blob.
 *
 * @param {Float32Array|number[]} floatArray - Samples, nominally in [-1, 1];
 *   out-of-range values are clamped.
 * @param {number} sampleRate - Sample rate in Hz written to the header.
 * @returns {Blob} Complete WAV file with MIME type "audio/wav".
 */
function float32ToWavBlob(floatArray, sampleRate) {
  const sampleCount = floatArray.length;
  const dataBytes = sampleCount * 2;
  const view = new DataView(new ArrayBuffer(44 + dataBytes));

  // Write a 4-character ASCII tag at the given byte offset.
  const putTag = (offset, tag) => {
    [...tag].forEach((ch, i) => view.setUint8(offset + i, ch.charCodeAt(0)));
  };

  // RIFF container header.
  putTag(0, "RIFF");
  view.setUint32(4, 36 + dataBytes, true); // remaining file size
  putTag(8, "WAVE");
  // "fmt " sub-chunk: 16-byte PCM descriptor.
  putTag(12, "fmt ");
  view.setUint32(16, 16, true); // sub-chunk size
  view.setUint16(20, 1, true); // audio format: PCM
  view.setUint16(22, 1, true); // channels: mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate (mono 16-bit)
  view.setUint16(32, 2, true); // block align
  view.setUint16(34, 16, true); // bits per sample
  // "data" sub-chunk.
  putTag(36, "data");
  view.setUint32(40, dataBytes, true);

  // Clamp each sample to [-1, 1] and scale to signed 16-bit (the
  // Int16Array assignment truncates fractional values).
  const quantized = new Int16Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    quantized[i] = Math.max(-1, Math.min(1, floatArray[i])) * 32767;
  }
  for (let i = 0; i < sampleCount; i++) {
    view.setInt16(44 + i * 2, quantized[i], true);
  }

  return new Blob([view], { type: "audio/wav" });
}
| export { setVoice, textToWavAudio, textToFloat32Audio }; | |