Upload 13 files
- .gitattributes +1 -0
- eslint.config.js +33 -0
- index.html +13 -0
- package-lock.json +0 -0
- package.json +25 -32
- public/logo.png +3 -0
- src/App.jsx +367 -0
- src/constants.js +53 -0
- src/index.css +12 -0
- src/main.jsx +10 -0
- src/play-worklet.js +73 -0
- src/vad-processor.js +37 -0
- src/worker.js +355 -0
- vite.config.js +19 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+public/logo.png filter=lfs diff=lfs merge=lfs -text
eslint.config.js
ADDED
@@ -0,0 +1,33 @@
import js from "@eslint/js";
import globals from "globals";
import reactHooks from "eslint-plugin-react-hooks";
import reactRefresh from "eslint-plugin-react-refresh";

export default [
  { ignores: ["dist"] },
  {
    files: ["**/*.{js,jsx}"],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
      parserOptions: {
        ecmaVersion: "latest",
        ecmaFeatures: { jsx: true },
        sourceType: "module",
      },
    },
    plugins: {
      "react-hooks": reactHooks,
      "react-refresh": reactRefresh,
    },
    rules: {
      ...js.configs.recommended.rules,
      ...reactHooks.configs.recommended.rules,
      "no-unused-vars": ["error", { varsIgnorePattern: "^[A-Z_]" }],
      "react-refresh/only-export-components": [
        "warn",
        { allowConstantExport: true },
      ],
    },
  },
];
index.html
ADDED
@@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/png" href="/logo.png" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Transformers.js | Speech-to-speech demo</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.jsx"></script>
  </body>
</html>
package-lock.json
ADDED
The diff for this file is too large to render.
package.json
CHANGED
@@ -1,39 +1,32 @@
(Removed: the previous Create React App manifest, including "version": "0.1.0", "react-scripts": "5.0.1", "web-vitals": "^2.1.4", and the "@testing-library/*" dependencies; the remaining removed fields are truncated in the rendered diff. Replaced with the Vite-based manifest below.)

{
  "name": "speech-to-speech",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "@huggingface/transformers": "^3.5.2",
    "@tailwindcss/vite": "^4.1.4",
    "kokoro-js": "^1.2.1",
    "lucide-react": "^0.503.0",
    "react": "^19.0.0",
    "react-dom": "^19.0.0",
    "tailwindcss": "^4.1.4"
  },
  "devDependencies": {
    "@eslint/js": "^9.22.0",
    "@types/react": "^19.0.10",
    "@types/react-dom": "^19.0.4",
    "@vitejs/plugin-react": "^4.3.4",
    "eslint": "^9.22.0",
    "eslint-plugin-react-hooks": "^5.2.0",
    "eslint-plugin-react-refresh": "^0.4.19",
    "globals": "^16.0.0",
    "vite": "^6.3.1"
  }
}
public/logo.png
ADDED
Binary image file, tracked with Git LFS.
src/App.jsx
ADDED
@@ -0,0 +1,367 @@
import { useEffect, useState, useRef } from "react";
import { Mic, PhoneOff, ChevronDown } from "lucide-react";
import { INPUT_SAMPLE_RATE } from "./constants";

import WORKLET from "./play-worklet.js";

export default function App() {
  const [callStartTime, setCallStartTime] = useState(null);
  const [callStarted, setCallStarted] = useState(false);
  const [playing, setPlaying] = useState(false);

  const [voice, setVoice] = useState("af_heart");
  const [voices, setVoices] = useState([]);

  const [isListening, setIsListening] = useState(false);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const [listeningScale, setListeningScale] = useState(1);
  const [speakingScale, setSpeakingScale] = useState(1);
  const [ripples, setRipples] = useState([]);

  const [ready, setReady] = useState(false);
  const [error, setError] = useState(null);
  const [elapsedTime, setElapsedTime] = useState("00:00");
  const worker = useRef(null);

  const node = useRef(null);

  useEffect(() => {
    worker.current?.postMessage({
      type: "set_voice",
      voice,
    });
  }, [voice]);

  useEffect(() => {
    if (!callStarted) {
      // Reset worker state after call ends
      worker.current?.postMessage({
        type: "end_call",
      });
    }
  }, [callStarted]);

  useEffect(() => {
    if (callStarted && callStartTime) {
      const interval = setInterval(() => {
        const diff = Math.floor((Date.now() - callStartTime) / 1000);
        const minutes = String(Math.floor(diff / 60)).padStart(2, "0");
        const seconds = String(diff % 60).padStart(2, "0");
        setElapsedTime(`${minutes}:${seconds}`);
      }, 1000);
      return () => clearInterval(interval);
    } else {
      setElapsedTime("00:00");
    }
  }, [callStarted, callStartTime]);

  useEffect(() => {
    worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
      type: "module",
    });

    const onMessage = ({ data }) => {
      console.log("Worker message:", data);
      if (data.error) {
        return onError(data.error);
      }

      switch (data.type) {
        case "status":
          if (data.status === "recording_start") {
            setIsListening(true);
            setIsSpeaking(false);
          } else if (data.status === "recording_end") {
            setIsListening(false);
          } else if (data.status === "ready") {
            setVoices(data.voices);
            setReady(true);
          }
          break;
        case "output":
          if (!playing) {
            node.current?.port.postMessage(data.result.audio);
            setPlaying(true);
            setIsSpeaking(true);
            setIsListening(false);
          }
          break;
      }
    };
    const onError = (err) => setError(err.message);

    worker.current.addEventListener("message", onMessage);
    worker.current.addEventListener("error", onError);

    return () => {
      worker.current.removeEventListener("message", onMessage);
      worker.current.removeEventListener("error", onError);
    };
  }, []);

  useEffect(() => {
    if (!callStarted) return;

    let worklet;
    let inputAudioContext;
    let source;
    let ignore = false;

    let outputAudioContext;
    const audioStreamPromise = navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        echoCancellation: true,
        autoGainControl: true,
        noiseSuppression: true,
        sampleRate: INPUT_SAMPLE_RATE,
      },
    });

    audioStreamPromise
      .then(async (stream) => {
        if (ignore) return;

        inputAudioContext = new (window.AudioContext ||
          window.webkitAudioContext)({
          sampleRate: INPUT_SAMPLE_RATE,
        });

        const analyser = inputAudioContext.createAnalyser();
        analyser.fftSize = 256;
        source = inputAudioContext.createMediaStreamSource(stream);
        source.connect(analyser);

        const inputDataArray = new Uint8Array(analyser.frequencyBinCount);

        function calculateRMS(array) {
          let sum = 0;
          for (let i = 0; i < array.length; ++i) {
            const normalized = array[i] / 128 - 1;
            sum += normalized * normalized;
          }
          const rms = Math.sqrt(sum / array.length);
          return rms;
        }

        await inputAudioContext.audioWorklet.addModule(
          new URL("./vad-processor.js", import.meta.url),
        );
        worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
          numberOfInputs: 1,
          numberOfOutputs: 0,
          channelCount: 1,
          channelCountMode: "explicit",
          channelInterpretation: "discrete",
        });

        source.connect(worklet);
        worklet.port.onmessage = (event) => {
          const { buffer } = event.data;
          worker.current?.postMessage({ type: "audio", buffer });
        };

        outputAudioContext = new AudioContext({
          sampleRate: 24000,
        });
        outputAudioContext.resume();

        const blob = new Blob([`(${WORKLET.toString()})()`], {
          type: "application/javascript",
        });
        const url = URL.createObjectURL(blob);
        await outputAudioContext.audioWorklet.addModule(url);
        URL.revokeObjectURL(url);

        node.current = new AudioWorkletNode(
          outputAudioContext,
          "buffered-audio-worklet-processor",
        );

        node.current.port.onmessage = (event) => {
          if (event.data.type === "playback_ended") {
            setPlaying(false);
            setIsSpeaking(false);
            worker.current?.postMessage({ type: "playback_ended" });
          }
        };

        const outputAnalyser = outputAudioContext.createAnalyser();
        outputAnalyser.fftSize = 256;

        node.current.connect(outputAnalyser);
        outputAnalyser.connect(outputAudioContext.destination);

        const outputDataArray = new Uint8Array(
          outputAnalyser.frequencyBinCount,
        );

        function updateVisualizers() {
          analyser.getByteTimeDomainData(inputDataArray);
          const rms = calculateRMS(inputDataArray);
          const targetScale = 1 + Math.min(1.25 * rms, 0.25);
          setListeningScale((prev) => prev + (targetScale - prev) * 0.25);

          outputAnalyser.getByteTimeDomainData(outputDataArray);
          const outputRMS = calculateRMS(outputDataArray);
          const targetOutputScale = 1 + Math.min(1.25 * outputRMS, 0.25);
          setSpeakingScale((prev) => prev + (targetOutputScale - prev) * 0.25);

          requestAnimationFrame(updateVisualizers);
        }
        updateVisualizers();
      })
      .catch((err) => {
        setError(err.message);
        console.error(err);
      });

    return () => {
      ignore = true;

      audioStreamPromise.then((stream) =>
        stream.getTracks().forEach((track) => track.stop()),
      );
      source?.disconnect();
      worklet?.disconnect();
      inputAudioContext?.close();

      outputAudioContext?.close();
    };
  }, [callStarted]);

  useEffect(() => {
    if (!callStarted) return;
    const interval = setInterval(() => {
      const id = Date.now();
      setRipples((prev) => [...prev, id]);
      setTimeout(() => {
        setRipples((prev) => prev.filter((r) => r !== id));
      }, 1500);
    }, 1000);
    return () => clearInterval(interval);
  }, [callStarted]);

  return (
    <div className="h-screen min-h-[240px] flex items-center justify-center bg-gray-50 p-4 relative">
      <div className="h-full max-h-[320px] w-[640px] bg-white rounded-xl shadow-lg p-8 flex items-center justify-between space-x-16">
        <div className="text-green-700 w-[140px]">
          <div className="text-xl font-bold flex justify-between">
            {voices?.[voice]?.name}
            <span className="font-normal text-gray-500">{elapsedTime}</span>
          </div>
          <div className="text-base relative">
            <button
              type="button"
              disabled={!ready}
              className={`w-full flex items-center justify-between border border-gray-300 rounded-md transition-colors ${
                ready
                  ? "bg-transparent hover:border-gray-400"
                  : "bg-gray-100 opacity-50 cursor-not-allowed"
              }`}
            >
              <span className="px-2 py-1">Select voice</span>
              <ChevronDown className="absolute right-2" />
            </button>
            <select
              value={voice}
              onChange={(e) => setVoice(e.target.value)}
              className="absolute inset-0 opacity-0 cursor-pointer"
              disabled={!ready}
            >
              {Object.entries(voices).map(([key, v]) => (
                <option key={key} value={key}>
                  {`${v.name} (${
                    v.language === "en-us" ? "American" : v.language
                  } ${v.gender})`}
                </option>
              ))}
            </select>
          </div>
        </div>

        <div className="relative flex items-center justify-center w-32 h-32 flex-shrink-0 aspect-square">
          {callStarted &&
            ripples.map((id) => (
              <div
                key={id}
                className="absolute inset-0 rounded-full border-2 border-green-200 pointer-events-none"
                style={{ animation: "ripple 1.5s ease-out forwards" }}
              />
            ))}
          <div className="absolute z-10 text-lg text-gray-700">
            {!ready ? "Loading..." : ""}
            {isListening && "Listening..."}
            {isSpeaking && "Speaking..."}
          </div>
          {/* Pulsing loader while initializing */}
          <div
            className={`absolute w-32 h-32 rounded-full bg-green-200 ${
              !ready ? "animate-ping opacity-75" : ""
            }`}
            style={{ animationDuration: "1.5s" }}
          />
          {/* Main rings */}
          <div
            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-300 ${
              !ready ? "opacity-0" : ""
            }`}
            style={{ transform: `scale(${speakingScale})` }}
          />
          <div
            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-200 ${
              !ready ? "opacity-0" : ""
            }`}
            style={{ transform: `scale(${listeningScale})` }}
          />
        </div>

        <div className="space-y-4 w-[140px]">
          {callStarted ? (
            <button
              className="flex items-center space-x-2 px-4 py-2 bg-red-100 text-red-700 rounded-md hover:bg-red-200"
              onClick={() => {
                setCallStarted(false);
                setCallStartTime(null);
                setPlaying(false);
                setIsListening(false);
                setIsSpeaking(false);
              }}
            >
              <PhoneOff className="w-5 h-5" />
              <span>End call</span>
            </button>
          ) : (
            <button
              className={`flex items-center space-x-2 px-4 py-2 rounded-md ${
                ready
                  ? "bg-blue-100 text-blue-700 hover:bg-blue-200"
                  : "bg-blue-100 text-blue-700 opacity-50 cursor-not-allowed"
              }`}
              onClick={() => {
                setCallStartTime(Date.now());
                setCallStarted(true);
                worker.current?.postMessage({ type: "start_call" });
              }}
              disabled={!ready}
            >
              <span>Start call</span>
            </button>
          )}
        </div>
      </div>

      <div className="absolute bottom-4 text-sm">
        Built with{" "}
        <a
          href="https://github.com/huggingface/transformers.js"
          rel="noopener noreferrer"
          target="_blank"
          className="text-blue-600 hover:underline"
        >
          🤗 Transformers.js
        </a>
      </div>
    </div>
  );
}
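For orientation only, a summary of the messages exchanged between the UI above and src/worker.js below; the message names are taken verbatim from the commit, nothing is added:

// Main thread -> worker:
//   { type: "set_voice", voice }   // change the Kokoro voice
//   { type: "start_call" }         // worker greets the user via TTS
//   { type: "end_call" }           // reset conversation state and interrupt generation
//   { type: "interrupt" }          // stop in-flight LLM generation
//   { type: "audio", buffer }      // 512-sample Float32Array chunks from the VAD worklet
//   { type: "playback_ended" }     // the output worklet drained its queue
// Worker -> main thread:
//   { type: "info", message }                                           // loading progress
//   { type: "status", status: "ready" | "recording_start" | "recording_end", ... }
//   { type: "output", text, result }                                    // streamed TTS audio chunks
//   { error }                                                           // surfaced as an error message in the UI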
src/constants.js
ADDED
@@ -0,0 +1,53 @@
/**
 * Sample rate of the input audio.
 * Coincidentally, this is the same for both models (Moonshine and Silero VAD)
 */
export const INPUT_SAMPLE_RATE = 16000;
const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;

/**
 * Probabilities ABOVE this value are considered as SPEECH
 */
export const SPEECH_THRESHOLD = 0.3;

/**
 * If current state is SPEECH, and the probability of the next state
 * is below this value, it is considered as NON-SPEECH.
 */
export const EXIT_THRESHOLD = 0.1;

/**
 * After each speech chunk, wait for at least this amount of silence
 * before considering the next chunk as a new speech chunk
 */
export const MIN_SILENCE_DURATION_MS = 400;
export const MIN_SILENCE_DURATION_SAMPLES =
  MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Pad the speech chunk with this amount each side
 */
export const SPEECH_PAD_MS = 80;
export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Final speech chunks below this duration are discarded
 */
export const MIN_SPEECH_DURATION_SAMPLES = 250 * INPUT_SAMPLE_RATE_MS; // 250 ms

/**
 * Maximum duration of audio that can be handled by Moonshine
 */
export const MAX_BUFFER_DURATION = 30;

/**
 * Size of the incoming buffers
 */
export const NEW_BUFFER_SIZE = 512;

/**
 * The number of previous buffers to keep, to ensure the audio is padded correctly
 */
export const MAX_NUM_PREV_BUFFERS = Math.ceil(
  SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
);
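A quick sanity check of the derived values at the 16 kHz input rate (arithmetic only, not part of the commit):

// INPUT_SAMPLE_RATE_MS         = 16000 / 1000         = 16 samples per millisecond
// SPEECH_PAD_SAMPLES           = 80 * 16               = 1280 samples of padding per side
// MIN_SILENCE_DURATION_SAMPLES = 400 * 16              = 6400 samples (0.4 s) of silence ends a chunk
// MIN_SPEECH_DURATION_SAMPLES  = 250 * 16              = 4000 samples; shorter chunks are discarded
// MAX_NUM_PREV_BUFFERS         = Math.ceil(1280 / 512) = 3 previous 512-sample buffers kept for left padding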
src/index.css
ADDED
@@ -0,0 +1,12 @@
@import "tailwindcss";

@keyframes ripple {
  from {
    transform: scale(1);
    opacity: 0.7;
  }
  to {
    transform: scale(2);
    opacity: 0;
  }
}
src/main.jsx
ADDED
@@ -0,0 +1,10 @@
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import "./index.css";
import App from "./App.jsx";

createRoot(document.getElementById("root")).render(
  <StrictMode>
    <App />
  </StrictMode>,
);
src/play-worklet.js
ADDED
@@ -0,0 +1,73 @@
export default () => {
  class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
    constructor() {
      super();
      this.bufferQueue = [];
      this.currentChunkOffset = 0;
      this.hadData = false;

      this.port.onmessage = (event) => {
        const data = event.data;
        if (data instanceof Float32Array) {
          this.hadData = true;
          this.bufferQueue.push(data);
        } else if (data === "stop") {
          this.bufferQueue = [];
          this.currentChunkOffset = 0;
        }
      };
    }

    process(inputs, outputs) {
      const channel = outputs[0][0];
      if (!channel) return true;

      const numSamples = channel.length;
      let outputIndex = 0;

      if (this.hadData && this.bufferQueue.length === 0) {
        this.port.postMessage({ type: "playback_ended" });
        this.hadData = false;
      }

      while (outputIndex < numSamples) {
        if (this.bufferQueue.length > 0) {
          const currentChunk = this.bufferQueue[0];
          const remainingSamples =
            currentChunk.length - this.currentChunkOffset;
          const samplesToCopy = Math.min(
            remainingSamples,
            numSamples - outputIndex,
          );

          channel.set(
            currentChunk.subarray(
              this.currentChunkOffset,
              this.currentChunkOffset + samplesToCopy,
            ),
            outputIndex,
          );

          this.currentChunkOffset += samplesToCopy;
          outputIndex += samplesToCopy;

          // Remove the chunk if fully consumed.
          if (this.currentChunkOffset >= currentChunk.length) {
            this.bufferQueue.shift();
            this.currentChunkOffset = 0;
          }
        } else {
          // If no data is available, fill the rest of the buffer with silence.
          channel.fill(0, outputIndex);
          outputIndex = numSamples;
        }
      }
      return true;
    }
  }

  registerProcessor(
    "buffered-audio-worklet-processor",
    BufferedAudioWorkletProcessor,
  );
};
src/vad-processor.js
ADDED
@@ -0,0 +1,37 @@
const MIN_CHUNK_SIZE = 512;
let globalPointer = 0;
let globalBuffer = new Float32Array(MIN_CHUNK_SIZE);

class VADProcessor extends AudioWorkletProcessor {
  process(inputs, outputs, parameters) {
    const buffer = inputs[0][0];
    if (!buffer) return; // buffer is null when the stream ends

    if (buffer.length > MIN_CHUNK_SIZE) {
      // If the buffer is larger than the minimum chunk size, send the entire buffer
      this.port.postMessage({ buffer });
    } else {
      const remaining = MIN_CHUNK_SIZE - globalPointer;
      if (buffer.length >= remaining) {
        // If the buffer is larger than (or equal to) the remaining space in the global buffer, copy the remaining space
        globalBuffer.set(buffer.subarray(0, remaining), globalPointer);

        // Send the global buffer
        this.port.postMessage({ buffer: globalBuffer });

        // Reset the global buffer and set the remaining buffer
        globalBuffer.fill(0);
        globalBuffer.set(buffer.subarray(remaining), 0);
        globalPointer = buffer.length - remaining;
      } else {
        // If the buffer is smaller than the remaining space in the global buffer, copy the buffer to the global buffer
        globalBuffer.set(buffer, globalPointer);
        globalPointer += buffer.length;
      }
    }

    return true; // Keep the processor alive
  }
}

registerProcessor("vad-processor", VADProcessor);
src/worker.js
ADDED
@@ -0,0 +1,355 @@
import {
  // VAD
  AutoModel,

  // LLM
  AutoTokenizer,
  AutoModelForCausalLM,
  TextStreamer,
  InterruptableStoppingCriteria,

  // Speech recognition
  Tensor,
  pipeline,
} from "@huggingface/transformers";

import { KokoroTTS, TextSplitterStream } from "kokoro-js";

import {
  MAX_BUFFER_DURATION,
  INPUT_SAMPLE_RATE,
  SPEECH_THRESHOLD,
  EXIT_THRESHOLD,
  SPEECH_PAD_SAMPLES,
  MAX_NUM_PREV_BUFFERS,
  MIN_SILENCE_DURATION_SAMPLES,
  MIN_SPEECH_DURATION_SAMPLES,
} from "./constants";

const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
let voice;
const tts = await KokoroTTS.from_pretrained(model_id, {
  dtype: "fp32",
  device: "webgpu",
});

const device = "webgpu";
self.postMessage({ type: "info", message: `Using device: "${device}"` });
self.postMessage({
  type: "info",
  message: "Loading models...",
  duration: "until_next",
});

// Load models
const silero_vad = await AutoModel.from_pretrained(
  "onnx-community/silero-vad",
  {
    config: { model_type: "custom" },
    dtype: "fp32", // Full-precision
  },
).catch((error) => {
  self.postMessage({ error });
  throw error;
});

const DEVICE_DTYPE_CONFIGS = {
  webgpu: {
    encoder_model: "fp32",
    decoder_model_merged: "fp32",
  },
  wasm: {
    encoder_model: "fp32",
    decoder_model_merged: "q8",
  },
};
const transcriber = await pipeline(
  "automatic-speech-recognition",
  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
  {
    device,
    dtype: DEVICE_DTYPE_CONFIGS[device],
  },
).catch((error) => {
  self.postMessage({ error });
  throw error;
});

await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders

const llm_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct";
const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
  dtype: "q4f16",
  device: "webgpu",
});

const SYSTEM_MESSAGE = {
  role: "system",
  content:
    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
};
await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders

let messages = [SYSTEM_MESSAGE];
let past_key_values_cache;
let stopping_criteria;
self.postMessage({
  type: "status",
  status: "ready",
  message: "Ready!",
  voices: tts.voices,
});

// Global audio buffer to store incoming audio
const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
let bufferPointer = 0;

// Initial state for VAD
const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);

// Whether we are in the process of adding audio to the buffer
let isRecording = false;
let isPlaying = false; // new flag

/**
 * Perform Voice Activity Detection (VAD)
 * @param {Float32Array} buffer The new audio buffer
 * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
 */
async function vad(buffer) {
  const input = new Tensor("float32", buffer, [1, buffer.length]);

  const { stateN, output } = await silero_vad({ input, sr, state });
  state = stateN; // Update state

  const isSpeech = output.data[0];

  // Use heuristics to determine if the buffer is speech or not
  return (
    // Case 1: We are above the threshold (definitely speech)
    isSpeech > SPEECH_THRESHOLD ||
    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
    (isRecording && isSpeech >= EXIT_THRESHOLD)
  );
}

/**
 * Transcribe the audio buffer
 * @param {Float32Array} buffer The audio buffer
 * @param {Object} data Additional data
 */
const speechToSpeech = async (buffer, data) => {
  isPlaying = true;

  // 1. Transcribe the audio from the user
  const text = await transcriber(buffer).then(({ text }) => text.trim());
  if (["", "[BLANK_AUDIO]"].includes(text)) {
    // If the transcription is empty or a blank audio, we skip the rest of the processing
    return;
  }
  messages.push({ role: "user", content: text });

  // Set up text-to-speech streaming
  const splitter = new TextSplitterStream();
  const stream = tts.stream(splitter, {
    voice,
  });
  (async () => {
    for await (const { text, phonemes, audio } of stream) {
      self.postMessage({ type: "output", text, result: audio });
    }
  })();

  // 2. Generate a response using the LLM
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
  });
  const streamer = new TextStreamer(tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
    callback_function: (text) => {
      splitter.push(text);
    },
    token_callback_function: () => {},
  });

  stopping_criteria = new InterruptableStoppingCriteria();
  const { past_key_values, sequences } = await llm.generate({
    ...inputs,
    past_key_values: past_key_values_cache,

    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
    max_new_tokens: 1024,
    streamer,
    stopping_criteria,
    return_dict_in_generate: true,
  });
  past_key_values_cache = past_key_values;

  // Finally, close the stream to signal that no more text will be added.
  splitter.close();

  const decoded = tokenizer.batch_decode(
    sequences.slice(null, [inputs.input_ids.dims[1], null]),
    { skip_special_tokens: true },
  );

  messages.push({ role: "assistant", content: decoded[0] });
};

// Track the number of samples after the last speech chunk
let postSpeechSamples = 0;
const resetAfterRecording = (offset = 0) => {
  self.postMessage({
    type: "status",
    status: "recording_end",
    message: "Transcribing...",
    duration: "until_next",
  });
  BUFFER.fill(0, offset);
  bufferPointer = offset;
  isRecording = false;
  postSpeechSamples = 0;
};

const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
  // Get start and end time of the speech segment, minus the padding
  const now = Date.now();
  const end =
    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
  const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
  const duration = end - start;
  const overflowLength = overflow?.length ?? 0;

  // Send the audio buffer to the worker
  const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);

  const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
  const paddedBuffer = new Float32Array(prevLength + buffer.length);
  let offset = 0;
  for (const prev of prevBuffers) {
    paddedBuffer.set(prev, offset);
    offset += prev.length;
  }
  paddedBuffer.set(buffer, offset);
  speechToSpeech(paddedBuffer, { start, end, duration });

  // Set overflow (if present) and reset the rest of the audio buffer
  if (overflow) {
    BUFFER.set(overflow, 0);
  }
  resetAfterRecording(overflowLength);
};

let prevBuffers = [];
self.onmessage = async (event) => {
  const { type, buffer } = event.data;

  // refuse new audio while playing back
  if (type === "audio" && isPlaying) return;

  switch (type) {
    case "start_call": {
      const name = tts.voices[voice ?? "af_heart"]?.name ?? "Heart";
      greet(`Hey there, my name is ${name}! How can I help you today?`);
      return;
    }
    case "end_call":
      messages = [SYSTEM_MESSAGE];
      past_key_values_cache = null;
    // fall through: an end_call also interrupts any in-flight generation
    case "interrupt":
      stopping_criteria?.interrupt();
      return;
    case "set_voice":
      voice = event.data.voice;
      return;
    case "playback_ended":
      isPlaying = false;
      return;
  }

  const wasRecording = isRecording; // Save current state
  const isSpeech = await vad(buffer);

  if (!wasRecording && !isSpeech) {
    // We are not recording, and the buffer is not speech,
    // so we will probably discard the buffer. So, we insert
    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
      // If the queue is full, we discard the oldest buffer
      prevBuffers.shift();
    }
    prevBuffers.push(buffer);
    return;
  }

  const remaining = BUFFER.length - bufferPointer;
  if (buffer.length >= remaining) {
    // The buffer is larger than (or equal to) the remaining space in the global buffer,
    // so we perform transcription and copy the overflow to the global buffer
    BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;

    // Dispatch the audio buffer
    const overflow = buffer.subarray(remaining);
    dispatchForTranscriptionAndResetAudioBuffer(overflow);
    return;
  } else {
    // The buffer is smaller than the remaining space in the global buffer,
    // so we copy it to the global buffer
    BUFFER.set(buffer, bufferPointer);
    bufferPointer += buffer.length;
  }

  if (isSpeech) {
    if (!isRecording) {
      // Indicate start of recording
      self.postMessage({
        type: "status",
        status: "recording_start",
        message: "Listening...",
        duration: "until_next",
      });
    }
    // Start or continue recording
    isRecording = true;
    postSpeechSamples = 0; // Reset the post-speech samples
    return;
  }

  postSpeechSamples += buffer.length;

  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
  // So, we check whether we have reached the end of the current audio chunk.
  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
    // There was a short pause, but not long enough to consider the end of a speech chunk
    // (e.g., the speaker took a breath), so we continue recording
    return;
  }

  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    // The entire buffer (including the new chunk) is smaller than the minimum
    // duration of a speech chunk, so we can safely discard the buffer.
    resetAfterRecording();
    return;
  }

  dispatchForTranscriptionAndResetAudioBuffer();
};

function greet(text) {
  isPlaying = true;
  const splitter = new TextSplitterStream();
  const stream = tts.stream(splitter, { voice });
  (async () => {
    for await (const { text: chunkText, audio } of stream) {
      self.postMessage({ type: "output", text: chunkText, result: audio });
    }
  })();
  splitter.push(text);
  splitter.close();
  messages.push({ role: "assistant", content: text });
}
vite.config.js
ADDED
@@ -0,0 +1,19 @@
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
import tailwindcss from "@tailwindcss/vite";

// https://vite.dev/config/
export default defineConfig({
  plugins: [tailwindcss(), react()],
  build: {
    target: "esnext",
  },
  worker: {
    format: "es",
  },
  resolve: {
    // Only bundle a single instance of Transformers.js
    // (shared by `@huggingface/transformers` and `kokoro-js`)
    dedupe: ["@huggingface/transformers"],
  },
});
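A likely reason for build.target: "esnext" and worker.format: "es" (an inference, not stated in the commit): src/worker.js awaits model loading at the top level, and top-level await is only valid in ES-module workers and at build targets that support it. A minimal, self-contained sketch of that pattern:

// module worker: top-level await delays message handling until setup has finished
const ready = await Promise.resolve("models loaded"); // stands in for the from_pretrained() calls above
self.onmessage = () => self.postMessage(ready);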