| /* Mini Piper implementation in Javascript. */ | |
| import EspeakModule from "./espeakng.worker.js"; | |
// espeak-ng output mode: synchronous synthesis, no audio playback.
const AUDIO_OUTPUT_SYNCHRONOUS = 2;
// Let espeak-ng auto-detect the input text encoding.
const espeakCHARS_AUTO = 0;
// Clause terminator flags returned by espeak-ng's TextToPhonemes variant:
// bits 12-13 encode intonation, bits 18-19 encode clause/sentence type.
// NOTE(review): values mirror espeak-ng's internal clause flags — confirm
// against the espeak-ng headers if the WASM build is upgraded.
const CLAUSE_INTONATION_FULL_STOP = 0x00000000;
const CLAUSE_INTONATION_COMMA = 0x00001000;
const CLAUSE_INTONATION_QUESTION = 0x00002000;
const CLAUSE_INTONATION_EXCLAMATION = 0x00003000;
const CLAUSE_TYPE_CLAUSE = 0x00040000;
const CLAUSE_TYPE_SENTENCE = 0x00080000;
// Composite terminator values for each punctuation mark (low bits are the
// pause length, per espeak-ng convention).
const CLAUSE_PERIOD = 40 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COMMA = 20 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
const CLAUSE_QUESTION = 40 | CLAUSE_INTONATION_QUESTION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_EXCLAMATION =
  45 | CLAUSE_INTONATION_EXCLAMATION | CLAUSE_TYPE_SENTENCE;
const CLAUSE_COLON = 30 | CLAUSE_INTONATION_FULL_STOP | CLAUSE_TYPE_CLAUSE;
const CLAUSE_SEMICOLON = 30 | CLAUSE_INTONATION_COMMA | CLAUSE_TYPE_CLAUSE;
// Special markers used in the Piper phoneme_id_map: beginning of sentence,
// end of sentence, and the pad token interleaved between phoneme ids.
const BOS = "^";
const EOS = "$";
const PAD = "_";
// Module-level state populated by setVoice():
let espeakInstance = null; // Emscripten espeak-ng module instance
let espeakInitialized = false; // NOTE(review): written nowhere in this file — possibly vestigial
let voiceModel = null; // ort.InferenceSession for the loaded voice
let voiceConfig = null; // parsed voice JSON configuration
/**
 * Load a Piper voice: fetch its JSON configuration, initialize espeak-ng
 * (when the voice uses espeak phonemes), and create the ONNX inference
 * session for the voice model.
 *
 * @param {string} voiceModelUrl - URL of the .onnx voice model.
 * @param {string} [voiceConfigUrl] - URL of the voice config JSON;
 *   defaults to `${voiceModelUrl}.json`.
 * @returns {Promise<void>}
 * @throws {Error} If the voice configuration cannot be fetched.
 */
async function setVoice(voiceModelUrl, voiceConfigUrl = undefined) {
  voiceConfigUrl = voiceConfigUrl ?? `${voiceModelUrl}.json`;
  const response = await fetch(voiceConfigUrl);
  if (!response.ok) {
    // BUG FIX: the original template literal was missing the "$", so the
    // URL was never interpolated into the error message.
    throw new Error(`Error loading voice configuration: ${voiceConfigUrl}`);
  }
  voiceConfig = await response.json();
  if (voiceConfig.phoneme_type === "espeak") {
    // Lazily create and initialize the espeak-ng WASM module once; it is
    // reused across voice changes.
    if (!espeakInstance) {
      espeakInstance = await EspeakModule();
      espeakInstance._espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, 0, 0);
    }
  }
  voiceModel = await ort.InferenceSession.create(voiceModelUrl);
}
/**
 * Synthesize text to a WAV audio Blob using the currently loaded voice.
 *
 * @param {string} text - Text to synthesize.
 * @param {number} [speakerId] - Speaker id for multi-speaker voices.
 * @param {number} [noiseScale] - VITS noise scale override.
 * @param {number} [lengthScale] - Speaking-rate scale override.
 * @param {number} [noiseWScale] - VITS noise-w scale override.
 * @returns {Promise<Blob>} 16-bit mono PCM WAV blob.
 * @throws {Error} If no voice has been set via setVoice().
 */
async function textToWavAudio(
  text,
  speakerId = undefined,
  noiseScale = undefined,
  lengthScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  const sampleRate = voiceConfig.audio.sample_rate;
  // BUG FIX: textToFloat32Audio's signature is
  // (text, speakerId, lengthScale, noiseScale, noiseWScale) — the original
  // call passed noiseScale and lengthScale in swapped positions.
  const float32Audio = await textToFloat32Audio(
    text,
    speakerId,
    lengthScale,
    noiseScale,
    noiseWScale,
  );
  return float32ToWavBlob(float32Audio, sampleRate);
}
/**
 * Synthesize text to raw Float32 PCM samples with the loaded ONNX voice.
 *
 * @param {string} text - Text to synthesize.
 * @param {number} [speakerId] - Speaker id for multi-speaker voices;
 *   defaults to 0 when the voice has more than one speaker.
 * @param {number} [lengthScale] - Speaking-rate scale; falls back to the
 *   voice config, then 1.0.
 * @param {number} [noiseScale] - Noise scale; falls back to the voice
 *   config, then 0.667.
 * @param {number} [noiseWScale] - Noise-w scale; falls back to the voice
 *   config, then 0.8.
 * @returns {Promise<Float32Array>} Mono audio samples in [-1, 1].
 * @throws {Error} If no voice has been set via setVoice().
 */
async function textToFloat32Audio(
  text,
  speakerId = undefined,
  lengthScale = undefined,
  noiseScale = undefined,
  noiseWScale = undefined,
) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  // Fall back to the voice's configured inference settings, then to
  // Piper's customary defaults.
  lengthScale = lengthScale ?? voiceConfig.inference.length_scale ?? 1.0;
  noiseScale = noiseScale ?? voiceConfig.inference.noise_scale ?? 0.667;
  noiseWScale = noiseWScale ?? voiceConfig.inference.noise_w ?? 0.8;
  if (voiceConfig.num_speakers > 1) {
    speakerId = speakerId ?? 0; // default to the first speaker
  }
  const textPhonemes = textToPhonemes(text);
  const phonemeIds = phonemesToIds(voiceConfig.phoneme_id_map, textPhonemes);
  // Build model inputs: int64 phoneme ids/lengths and float32 scales.
  const phonemeIdsTensor = new ort.Tensor(
    "int64",
    new BigInt64Array(phonemeIds.map((x) => BigInt(x))),
    [1, phonemeIds.length],
  );
  const phonemeLengthsTensor = new ort.Tensor(
    "int64",
    BigInt64Array.from([BigInt(phonemeIds.length)]),
    [1],
  );
  const scalesTensor = new ort.Tensor(
    "float32",
    Float32Array.from([noiseScale, lengthScale, noiseWScale]),
    [3],
  );
  const feeds = {
    input: phonemeIdsTensor,
    input_lengths: phonemeLengthsTensor,
    scales: scalesTensor,
  };
  if (voiceConfig.num_speakers > 1) {
    // Multi-speaker models take a speaker-id tensor; give it an explicit
    // [1] shape rather than relying on shape inference.
    feeds.sid = new ort.Tensor(
      "int64",
      BigInt64Array.from([BigInt(speakerId)]),
      [1],
    );
  }
  const results = await voiceModel.run(feeds);
  // Use the public Tensor.data accessor instead of the internal `cpuData`
  // field, which is not part of onnxruntime-web's documented API.
  return results.output.data;
}
/**
 * Convert text into phonemes: returns an array of sentences, each an array
 * of single-code-point phoneme strings (after NFD normalization).
 * Uses espeak-ng (WASM) unless the voice's phoneme_type is "text".
 *
 * @param {string} text - Text to phonemize.
 * @returns {string[][]} Phonemes grouped by sentence.
 * @throws {Error} If no voice is set, or espeak-ng is required but not
 *   initialized.
 */
function textToPhonemes(text) {
  if (!voiceConfig) {
    throw new Error("Voice is not set");
  }
  if (voiceConfig.phoneme_type == "text") {
    // Text phonemes: the characters themselves are the phonemes; the whole
    // input is treated as a single sentence. NFD keeps combining marks as
    // separate code points so each can be looked up in the id map.
    return [Array.from(text.normalize("NFD"))];
  }
  if (!espeakInstance) {
    throw new Error("espeak-ng is not initialized");
  }
  const voice = voiceConfig.espeak.voice;
  // Set voice: copy the voice name into the WASM heap as a NUL-terminated
  // UTF-8 string, hand the pointer to espeak-ng, then free it.
  const voicePtr = espeakInstance._malloc(
    espeakInstance.lengthBytesUTF8(voice) + 1,
  );
  espeakInstance.stringToUTF8(
    voice,
    voicePtr,
    espeakInstance.lengthBytesUTF8(voice) + 1,
  );
  espeakInstance._espeak_SetVoiceByName(voicePtr);
  espeakInstance._free(voicePtr);
  // Prepare text: same heap-copy dance for the input text.
  const textPtr = espeakInstance._malloc(
    espeakInstance.lengthBytesUTF8(text) + 1,
  );
  espeakInstance.stringToUTF8(
    text,
    textPtr,
    espeakInstance.lengthBytesUTF8(text) + 1,
  );
  // espeak consumes a char** and advances it past each clause; allocate a
  // 4-byte slot for the pointer (wasm32 pointers are 32-bit).
  const textPtrPtr = espeakInstance._malloc(4);
  espeakInstance.setValue(textPtrPtr, textPtr, "*");
  // Out-parameter receiving the clause terminator flags (i32).
  const terminatorPtr = espeakInstance._malloc(4);
  // Phoneme lists for each sentence
  const textPhonemes = [];
  // Phoneme list for current sentence
  let sentencePhonemes = [];
  while (true) {
    // NOTE(review): _espeak_TextToPhonemesWithTerminator appears to be a
    // custom export of this espeak-ng build (upstream has
    // espeak_TextToPhonemes without the terminator out-param) — confirm
    // against the WASM build. 0x02 selects IPA phoneme output.
    const phonemesPtr = espeakInstance._espeak_TextToPhonemesWithTerminator(
      textPtrPtr,
      espeakCHARS_AUTO,
      /* IPA */ 0x02,
      terminatorPtr,
    );
    const clausePhonemes = espeakInstance.UTF8ToString(phonemesPtr);
    sentencePhonemes.push(clausePhonemes);
    const terminator = espeakInstance.getValue(terminatorPtr, "i32");
    // Mask off everything above the intonation bits; what remains matches
    // one of the CLAUSE_* composite constants.
    const punctuation = terminator & 0x000fffff;
    // Add punctuation phonemes so the model sees sentence-internal pauses.
    if (punctuation === CLAUSE_PERIOD) {
      sentencePhonemes.push(".");
    } else if (punctuation === CLAUSE_QUESTION) {
      sentencePhonemes.push("?");
    } else if (punctuation === CLAUSE_EXCLAMATION) {
      sentencePhonemes.push("!");
    } else if (punctuation === CLAUSE_COMMA) {
      sentencePhonemes.push(", ");
    } else if (punctuation === CLAUSE_COLON) {
      sentencePhonemes.push(": ");
    } else if (punctuation === CLAUSE_SEMICOLON) {
      sentencePhonemes.push("; ");
    }
    if ((terminator & CLAUSE_TYPE_SENTENCE) === CLAUSE_TYPE_SENTENCE) {
      // End of sentence: start accumulating a new one.
      textPhonemes.push(sentencePhonemes);
      sentencePhonemes = [];
    }
    // espeak wrote the address of the unprocessed remainder back into the
    // char** slot; 0 means the whole input has been consumed.
    const nextTextPtr = espeakInstance.getValue(textPtrPtr, "*");
    if (nextTextPtr === 0) {
      break; // All text processed
    }
    // Advance text pointer
    espeakInstance.setValue(textPtrPtr, nextTextPtr, "*");
  }
  // Clean up all heap allocations made above.
  espeakInstance._free(textPtr);
  espeakInstance._free(textPtrPtr);
  espeakInstance._free(terminatorPtr);
  // Add lingering phonemes (input that ended without a sentence terminator).
  if (sentencePhonemes.length > 0) {
    textPhonemes.push(sentencePhonemes);
    sentencePhonemes = [];
  }
  // Prepare phonemes for Piper: join each sentence's clause strings and
  // split into single code points (NFD so diacritics stay separate).
  for (let i = 0; i < textPhonemes.length; i++) {
    textPhonemes[i] = Array.from(textPhonemes[i].join("").normalize("NFD"));
  }
  return textPhonemes;
}
/**
 * Map phonemes to model ids, interleaving the pad token the way Piper
 * expects: each sentence becomes [BOS, PAD, id, PAD, id, PAD, ..., EOS].
 * Phonemes missing from the id map are silently skipped.
 *
 * @param {Object} idMap - phoneme -> id mapping from the voice config.
 * @param {string[][]} textPhonemes - Phonemes grouped by sentence.
 * @returns {Array} Flat list of phoneme ids across all sentences.
 */
function phonemesToIds(idMap, textPhonemes) {
  const ids = [];
  for (const sentence of textPhonemes) {
    ids.push(idMap[BOS], idMap[PAD]);
    for (const phoneme of sentence) {
      if (phoneme in idMap) {
        ids.push(idMap[phoneme], idMap[PAD]);
      }
    }
    ids.push(idMap[EOS]);
  }
  return ids;
}
/**
 * Encode Float32 PCM samples as a 16-bit mono RIFF/WAVE blob.
 *
 * @param {Float32Array|number[]} floatArray - Samples, nominally in [-1, 1];
 *   out-of-range values are clamped.
 * @param {number} sampleRate - Sample rate in Hz written to the header.
 * @returns {Blob} Complete WAV file with MIME type "audio/wav".
 */
function float32ToWavBlob(floatArray, sampleRate) {
  const sampleCount = floatArray.length;
  const dataBytes = sampleCount * 2;
  const view = new DataView(new ArrayBuffer(44 + dataBytes));

  // Write a 4-character ASCII tag at the given byte offset.
  const putTag = (offset, tag) => {
    [...tag].forEach((ch, i) => view.setUint8(offset + i, ch.charCodeAt(0)));
  };

  // RIFF container header.
  putTag(0, "RIFF");
  view.setUint32(4, 36 + dataBytes, true); // remaining file size
  putTag(8, "WAVE");
  // "fmt " sub-chunk: 16-byte PCM descriptor.
  putTag(12, "fmt ");
  view.setUint32(16, 16, true); // sub-chunk size
  view.setUint16(20, 1, true); // audio format: PCM
  view.setUint16(22, 1, true); // channels: mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true); // byte rate (mono 16-bit)
  view.setUint16(32, 2, true); // block align
  view.setUint16(34, 16, true); // bits per sample
  // "data" sub-chunk.
  putTag(36, "data");
  view.setUint32(40, dataBytes, true);

  // Clamp each sample to [-1, 1] and scale to signed 16-bit (the
  // Int16Array assignment truncates fractional values).
  const quantized = new Int16Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    quantized[i] = Math.max(-1, Math.min(1, floatArray[i])) * 32767;
  }
  for (let i = 0; i < sampleCount; i++) {
    view.setInt16(44 + i * 2, quantized[i], true);
  }

  return new Blob([view], { type: "audio/wav" });
}
| export { setVoice, textToWavAudio, textToFloat32Audio }; | |