Xenova (HF Staff) committed
Commit e9129d0 · verified · 1 Parent(s): 8834b4b

Upload 13 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ public/logo.png filter=lfs diff=lfs merge=lfs -text
eslint.config.js ADDED
@@ -0,0 +1,33 @@
+ import js from "@eslint/js";
+ import globals from "globals";
+ import reactHooks from "eslint-plugin-react-hooks";
+ import reactRefresh from "eslint-plugin-react-refresh";
+
+ export default [
+   { ignores: ["dist"] },
+   {
+     files: ["**/*.{js,jsx}"],
+     languageOptions: {
+       ecmaVersion: 2020,
+       globals: globals.browser,
+       parserOptions: {
+         ecmaVersion: "latest",
+         ecmaFeatures: { jsx: true },
+         sourceType: "module",
+       },
+     },
+     plugins: {
+       "react-hooks": reactHooks,
+       "react-refresh": reactRefresh,
+     },
+     rules: {
+       ...js.configs.recommended.rules,
+       ...reactHooks.configs.recommended.rules,
+       "no-unused-vars": ["error", { varsIgnorePattern: "^[A-Z_]" }],
+       "react-refresh/only-export-components": [
+         "warn",
+         { allowConstantExport: true },
+       ],
+     },
+   },
+ ];
index.html ADDED
@@ -0,0 +1,13 @@
+ <!doctype html>
+ <html lang="en">
+   <head>
+     <meta charset="UTF-8" />
+     <link rel="icon" type="image/png" href="/logo.png" />
+     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+     <title>Transformers.js | Speech-to-speech demo</title>
+   </head>
+   <body>
+     <div id="root"></div>
+     <script type="module" src="/src/main.jsx"></script>
+   </body>
+ </html>
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json CHANGED
@@ -1,39 +1,32 @@
  {
-   "name": "react-template",
-   "version": "0.1.0",
+   "name": "speech-to-speech",
    "private": true,
-   "dependencies": {
-     "@testing-library/dom": "^10.4.0",
-     "@testing-library/jest-dom": "^6.6.3",
-     "@testing-library/react": "^16.3.0",
-     "@testing-library/user-event": "^13.5.0",
-     "react": "^19.1.0",
-     "react-dom": "^19.1.0",
-     "react-scripts": "5.0.1",
-     "web-vitals": "^2.1.4"
-   },
+   "version": "0.0.0",
+   "type": "module",
    "scripts": {
-     "start": "react-scripts start",
-     "build": "react-scripts build",
-     "test": "react-scripts test",
-     "eject": "react-scripts eject"
+     "dev": "vite",
+     "build": "vite build",
+     "lint": "eslint .",
+     "preview": "vite preview"
    },
-   "eslintConfig": {
-     "extends": [
-       "react-app",
-       "react-app/jest"
-     ]
+   "dependencies": {
+     "@huggingface/transformers": "^3.5.2",
+     "@tailwindcss/vite": "^4.1.4",
+     "kokoro-js": "^1.2.1",
+     "lucide-react": "^0.503.0",
+     "react": "^19.0.0",
+     "react-dom": "^19.0.0",
+     "tailwindcss": "^4.1.4"
    },
-   "browserslist": {
-     "production": [
-       ">0.2%",
-       "not dead",
-       "not op_mini all"
-     ],
-     "development": [
-       "last 1 chrome version",
-       "last 1 firefox version",
-       "last 1 safari version"
-     ]
+   "devDependencies": {
+     "@eslint/js": "^9.22.0",
+     "@types/react": "^19.0.10",
+     "@types/react-dom": "^19.0.4",
+     "@vitejs/plugin-react": "^4.3.4",
+     "eslint": "^9.22.0",
+     "eslint-plugin-react-hooks": "^5.2.0",
+     "eslint-plugin-react-refresh": "^0.4.19",
+     "globals": "^16.0.0",
+     "vite": "^6.3.1"
    }
  }
public/logo.png ADDED

Git LFS Details

  • SHA256: 9fb2bd90d1eeab88414681bb80464bc723ab57fb2c5b2f33367f16a0157ed5c0
  • Pointer size: 131 Bytes
  • Size of remote file: 634 kB
src/App.jsx ADDED
@@ -0,0 +1,367 @@
+ import { useEffect, useState, useRef } from "react";
+ import { Mic, PhoneOff, ChevronDown } from "lucide-react";
+ import { INPUT_SAMPLE_RATE } from "./constants";
+
+ import WORKLET from "./play-worklet.js";
+
+ export default function App() {
+   const [callStartTime, setCallStartTime] = useState(null);
+   const [callStarted, setCallStarted] = useState(false);
+   const [playing, setPlaying] = useState(false);
+
+   const [voice, setVoice] = useState("af_heart");
+   const [voices, setVoices] = useState([]);
+
+   const [isListening, setIsListening] = useState(false);
+   const [isSpeaking, setIsSpeaking] = useState(false);
+   const [listeningScale, setListeningScale] = useState(1);
+   const [speakingScale, setSpeakingScale] = useState(1);
+   const [ripples, setRipples] = useState([]);
+
+   const [ready, setReady] = useState(false);
+   const [error, setError] = useState(null);
+   const [elapsedTime, setElapsedTime] = useState("00:00");
+   const worker = useRef(null);
+
+   const node = useRef(null);
+
+   useEffect(() => {
+     worker.current?.postMessage({
+       type: "set_voice",
+       voice,
+     });
+   }, [voice]);
+
+   useEffect(() => {
+     if (!callStarted) {
+       // Reset worker state after call ends
+       worker.current?.postMessage({
+         type: "end_call",
+       });
+     }
+   }, [callStarted]);
+
+   useEffect(() => {
+     if (callStarted && callStartTime) {
+       const interval = setInterval(() => {
+         const diff = Math.floor((Date.now() - callStartTime) / 1000);
+         const minutes = String(Math.floor(diff / 60)).padStart(2, "0");
+         const seconds = String(diff % 60).padStart(2, "0");
+         setElapsedTime(`${minutes}:${seconds}`);
+       }, 1000);
+       return () => clearInterval(interval);
+     } else {
+       setElapsedTime("00:00");
+     }
+   }, [callStarted, callStartTime]);
+
+   useEffect(() => {
+     worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
+       type: "module",
+     });
+
+     const onMessage = ({ data }) => {
+       console.log("Worker message:", data);
+       if (data.error) {
+         return onError(data.error);
+       }
+
+       switch (data.type) {
+         case "status":
+           if (data.status === "recording_start") {
+             setIsListening(true);
+             setIsSpeaking(false);
+           } else if (data.status === "recording_end") {
+             setIsListening(false);
+           } else if (data.status === "ready") {
+             setVoices(data.voices);
+             setReady(true);
+           }
+           break;
+         case "output":
+           if (!playing) {
+             node.current?.port.postMessage(data.result.audio);
+             setPlaying(true);
+             setIsSpeaking(true);
+             setIsListening(false);
+           }
+           break;
+       }
+     };
+     const onError = (err) => setError(err.message);
+
+     worker.current.addEventListener("message", onMessage);
+     worker.current.addEventListener("error", onError);
+
+     return () => {
+       worker.current.removeEventListener("message", onMessage);
+       worker.current.removeEventListener("error", onError);
+     };
+   }, []);
+
+   useEffect(() => {
+     if (!callStarted) return;
+
+     let worklet;
+     let inputAudioContext;
+     let source;
+     let ignore = false;
+
+     let outputAudioContext;
+     const audioStreamPromise = navigator.mediaDevices.getUserMedia({
+       audio: {
+         channelCount: 1,
+         echoCancellation: true,
+         autoGainControl: true,
+         noiseSuppression: true,
+         sampleRate: INPUT_SAMPLE_RATE,
+       },
+     });
+
+     audioStreamPromise
+       .then(async (stream) => {
+         if (ignore) return;
+
+         inputAudioContext = new (window.AudioContext ||
+           window.webkitAudioContext)({
+           sampleRate: INPUT_SAMPLE_RATE,
+         });
+
+         const analyser = inputAudioContext.createAnalyser();
+         analyser.fftSize = 256;
+         source = inputAudioContext.createMediaStreamSource(stream);
+         source.connect(analyser);
+
+         const inputDataArray = new Uint8Array(analyser.frequencyBinCount);
+
+         function calculateRMS(array) {
+           let sum = 0;
+           for (let i = 0; i < array.length; ++i) {
+             const normalized = array[i] / 128 - 1;
+             sum += normalized * normalized;
+           }
+           const rms = Math.sqrt(sum / array.length);
+           return rms;
+         }
+
+         await inputAudioContext.audioWorklet.addModule(
+           new URL("./vad-processor.js", import.meta.url),
+         );
+         worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
+           numberOfInputs: 1,
+           numberOfOutputs: 0,
+           channelCount: 1,
+           channelCountMode: "explicit",
+           channelInterpretation: "discrete",
+         });
+
+         source.connect(worklet);
+         worklet.port.onmessage = (event) => {
+           const { buffer } = event.data;
+           worker.current?.postMessage({ type: "audio", buffer });
+         };
+
+         outputAudioContext = new AudioContext({
+           sampleRate: 24000,
+         });
+         outputAudioContext.resume();
+
+         const blob = new Blob([`(${WORKLET.toString()})()`], {
+           type: "application/javascript",
+         });
+         const url = URL.createObjectURL(blob);
+         await outputAudioContext.audioWorklet.addModule(url);
+         URL.revokeObjectURL(url);
+
+         node.current = new AudioWorkletNode(
+           outputAudioContext,
+           "buffered-audio-worklet-processor",
+         );
+
+         node.current.port.onmessage = (event) => {
+           if (event.data.type === "playback_ended") {
+             setPlaying(false);
+             setIsSpeaking(false);
+             worker.current?.postMessage({ type: "playback_ended" });
+           }
+         };
+
+         const outputAnalyser = outputAudioContext.createAnalyser();
+         outputAnalyser.fftSize = 256;
+
+         node.current.connect(outputAnalyser);
+         outputAnalyser.connect(outputAudioContext.destination);
+
+         const outputDataArray = new Uint8Array(
+           outputAnalyser.frequencyBinCount,
+         );
+
+         function updateVisualizers() {
+           analyser.getByteTimeDomainData(inputDataArray);
+           const rms = calculateRMS(inputDataArray);
+           const targetScale = 1 + Math.min(1.25 * rms, 0.25);
+           setListeningScale((prev) => prev + (targetScale - prev) * 0.25);
+
+           outputAnalyser.getByteTimeDomainData(outputDataArray);
+           const outputRMS = calculateRMS(outputDataArray);
+           const targetOutputScale = 1 + Math.min(1.25 * outputRMS, 0.25);
+           setSpeakingScale((prev) => prev + (targetOutputScale - prev) * 0.25);
+
+           requestAnimationFrame(updateVisualizers);
+         }
+         updateVisualizers();
+       })
+       .catch((err) => {
+         setError(err.message);
+         console.error(err);
+       });
+
+     return () => {
+       ignore = true;
+
+       audioStreamPromise.then((stream) =>
+         stream.getTracks().forEach((track) => track.stop()),
+       );
+       source?.disconnect();
+       worklet?.disconnect();
+       inputAudioContext?.close();
+
+       outputAudioContext?.close();
+     };
+   }, [callStarted]);
+
+   useEffect(() => {
+     if (!callStarted) return;
+     const interval = setInterval(() => {
+       const id = Date.now();
+       setRipples((prev) => [...prev, id]);
+       setTimeout(() => {
+         setRipples((prev) => prev.filter((r) => r !== id));
+       }, 1500);
+     }, 1000);
+     return () => clearInterval(interval);
+   }, [callStarted]);
+
+   return (
+     <div className="h-screen min-h-[240px] flex items-center justify-center bg-gray-50 p-4 relative">
+       <div className="h-full max-h-[320px] w-[640px] bg-white rounded-xl shadow-lg p-8 flex items-center justify-between space-x-16">
+         <div className="text-green-700 w-[140px]">
+           <div className="text-xl font-bold flex justify-between">
+             {voices?.[voice]?.name}
+             <span className="font-normal text-gray-500">{elapsedTime}</span>
+           </div>
+           <div className="text-base relative">
+             <button
+               type="button"
+               disabled={!ready}
+               className={`w-full flex items-center justify-between border border-gray-300 rounded-md transition-colors ${
+                 ready
+                   ? "bg-transparent hover:border-gray-400"
+                   : "bg-gray-100 opacity-50 cursor-not-allowed"
+               }`}
+             >
+               <span className="px-2 py-1">Select voice</span>
+               <ChevronDown className="absolute right-2" />
+             </button>
+             <select
+               value={voice}
+               onChange={(e) => setVoice(e.target.value)}
+               className="absolute inset-0 opacity-0 cursor-pointer"
+               disabled={!ready}
+             >
+               {Object.entries(voices).map(([key, v]) => (
+                 <option key={key} value={key}>
+                   {`${v.name} (${
+                     v.language === "en-us" ? "American" : v.language
+                   } ${v.gender})`}
+                 </option>
+               ))}
+             </select>
+           </div>
+         </div>
+
+         <div className="relative flex items-center justify-center w-32 h-32 flex-shrink-0 aspect-square">
+           {callStarted &&
+             ripples.map((id) => (
+               <div
+                 key={id}
+                 className="absolute inset-0 rounded-full border-2 border-green-200 pointer-events-none"
+                 style={{ animation: "ripple 1.5s ease-out forwards" }}
+               />
+             ))}
+           <div className="absolute z-10 text-lg text-gray-700">
+             {!ready ? "Loading..." : ""}
+             {isListening && "Listening..."}
+             {isSpeaking && "Speaking..."}
+           </div>
+           {/* Pulsing loader while initializing */}
+           <div
+             className={`absolute w-32 h-32 rounded-full bg-green-200 ${
+               !ready ? "animate-ping opacity-75" : ""
+             }`}
+             style={{ animationDuration: "1.5s" }}
+           />
+           {/* Main rings */}
+           <div
+             className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-300 ${
+               !ready ? "opacity-0" : ""
+             }`}
+             style={{ transform: `scale(${speakingScale})` }}
+           />
+           <div
+             className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-200 ${
+               !ready ? "opacity-0" : ""
+             }`}
+             style={{ transform: `scale(${listeningScale})` }}
+           />
+         </div>
+
+         <div className="space-y-4 w-[140px]">
+           {callStarted ? (
+             <button
+               className="flex items-center space-x-2 px-4 py-2 bg-red-100 text-red-700 rounded-md hover:bg-red-200"
+               onClick={() => {
+                 setCallStarted(false);
+                 setCallStartTime(null);
+                 setPlaying(false);
+                 setIsListening(false);
+                 setIsSpeaking(false);
+               }}
+             >
+               <PhoneOff className="w-5 h-5" />
+               <span>End call</span>
+             </button>
+           ) : (
+             <button
+               className={`flex items-center space-x-2 px-4 py-2 rounded-md ${
+                 ready
+                   ? "bg-blue-100 text-blue-700 hover:bg-blue-200"
+                   : "bg-blue-100 text-blue-700 opacity-50 cursor-not-allowed"
+               }`}
+               onClick={() => {
+                 setCallStartTime(Date.now());
+                 setCallStarted(true);
+                 worker.current?.postMessage({ type: "start_call" });
+               }}
+               disabled={!ready}
+             >
+               <span>Start call</span>
+             </button>
+           )}
+         </div>
+       </div>
+
+       <div className="absolute bottom-4 text-sm">
+         Built with{" "}
+         <a
+           href="https://github.com/huggingface/transformers.js"
+           rel="noopener noreferrer"
+           target="_blank"
+           className="text-blue-600 hover:underline"
+         >
+           🤗 Transformers.js
+         </a>
+       </div>
+     </div>
+   );
+ }
src/constants.js ADDED
@@ -0,0 +1,53 @@
+ /**
+  * Sample rate of the input audio.
+  * Coincidentally, this is the same for both models (Moonshine and Silero VAD).
+  */
+ export const INPUT_SAMPLE_RATE = 16000;
+ const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;
+
+ /**
+  * Probabilities ABOVE this value are considered SPEECH.
+  */
+ export const SPEECH_THRESHOLD = 0.3;
+
+ /**
+  * If the current state is SPEECH, and the probability of the next state
+  * is below this value, it is considered NON-SPEECH.
+  */
+ export const EXIT_THRESHOLD = 0.1;
+
+ /**
+  * After each speech chunk, wait for at least this amount of silence
+  * before considering the next chunk as a new speech chunk.
+  */
+ export const MIN_SILENCE_DURATION_MS = 400;
+ export const MIN_SILENCE_DURATION_SAMPLES =
+   MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;
+
+ /**
+  * Pad the speech chunk with this amount on each side.
+  */
+ export const SPEECH_PAD_MS = 80;
+ export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;
+
+ /**
+  * Final speech chunks below this duration are discarded.
+  */
+ export const MIN_SPEECH_DURATION_SAMPLES = 250 * INPUT_SAMPLE_RATE_MS; // 250 ms
+
+ /**
+  * Maximum duration of audio (in seconds) that can be handled by Moonshine.
+  */
+ export const MAX_BUFFER_DURATION = 30;
+
+ /**
+  * Size of the incoming buffers.
+  */
+ export const NEW_BUFFER_SIZE = 512;
+
+ /**
+  * The number of previous buffers to keep, to ensure the audio is padded correctly.
+  */
+ export const MAX_NUM_PREV_BUFFERS = Math.ceil(
+   SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
+ );
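
For a quick sanity check, the derived constants above work out as follows (16 samples per millisecond at the 16 kHz input rate):

// MIN_SILENCE_DURATION_SAMPLES = 400 ms * 16 samples/ms = 6400 samples
// SPEECH_PAD_SAMPLES           = 80 ms  * 16 samples/ms = 1280 samples
// MIN_SPEECH_DURATION_SAMPLES  = 250 ms * 16 samples/ms = 4000 samples
// MAX_NUM_PREV_BUFFERS         = ceil(1280 / 512)       = 3 previous 512-sample buffers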
src/index.css ADDED
@@ -0,0 +1,12 @@
+ @import "tailwindcss";
+
+ @keyframes ripple {
+   from {
+     transform: scale(1);
+     opacity: 0.7;
+   }
+   to {
+     transform: scale(2);
+     opacity: 0;
+   }
+ }
src/main.jsx ADDED
@@ -0,0 +1,10 @@
+ import { StrictMode } from "react";
+ import { createRoot } from "react-dom/client";
+ import "./index.css";
+ import App from "./App.jsx";
+
+ createRoot(document.getElementById("root")).render(
+   <StrictMode>
+     <App />
+   </StrictMode>,
+ );
src/play-worklet.js ADDED
@@ -0,0 +1,73 @@
+ export default () => {
+   class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
+     constructor() {
+       super();
+       this.bufferQueue = [];
+       this.currentChunkOffset = 0;
+       this.hadData = false;
+
+       this.port.onmessage = (event) => {
+         const data = event.data;
+         if (data instanceof Float32Array) {
+           this.hadData = true;
+           this.bufferQueue.push(data);
+         } else if (data === "stop") {
+           this.bufferQueue = [];
+           this.currentChunkOffset = 0;
+         }
+       };
+     }
+
+     process(inputs, outputs) {
+       const channel = outputs[0][0];
+       if (!channel) return true;
+
+       const numSamples = channel.length;
+       let outputIndex = 0;
+
+       if (this.hadData && this.bufferQueue.length === 0) {
+         this.port.postMessage({ type: "playback_ended" });
+         this.hadData = false;
+       }
+
+       while (outputIndex < numSamples) {
+         if (this.bufferQueue.length > 0) {
+           const currentChunk = this.bufferQueue[0];
+           const remainingSamples =
+             currentChunk.length - this.currentChunkOffset;
+           const samplesToCopy = Math.min(
+             remainingSamples,
+             numSamples - outputIndex,
+           );
+
+           channel.set(
+             currentChunk.subarray(
+               this.currentChunkOffset,
+               this.currentChunkOffset + samplesToCopy,
+             ),
+             outputIndex,
+           );
+
+           this.currentChunkOffset += samplesToCopy;
+           outputIndex += samplesToCopy;
+
+           // Remove the chunk if fully consumed.
+           if (this.currentChunkOffset >= currentChunk.length) {
+             this.bufferQueue.shift();
+             this.currentChunkOffset = 0;
+           }
+         } else {
+           // If no data is available, fill the rest of the buffer with silence.
+           channel.fill(0, outputIndex);
+           outputIndex = numSamples;
+         }
+       }
+       return true;
+     }
+   }
+
+   registerProcessor(
+     "buffered-audio-worklet-processor",
+     BufferedAudioWorkletProcessor,
+   );
+ };
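
A minimal usage sketch of the processor above (variable names here are illustrative; the real wiring lives in src/App.jsx, which inlines the exported factory via a Blob URL so no separate worklet file needs to be served):

import WORKLET from "./play-worklet.js";

const ctx = new AudioContext({ sampleRate: 24000 });
const blob = new Blob([`(${WORKLET.toString()})()`], {
  type: "application/javascript",
});
await ctx.audioWorklet.addModule(URL.createObjectURL(blob));

const node = new AudioWorkletNode(ctx, "buffered-audio-worklet-processor");
node.connect(ctx.destination);

node.port.postMessage(new Float32Array(24000)); // enqueue 1 s of (silent) audio
node.port.onmessage = ({ data }) => {
  if (data.type === "playback_ended") console.log("queue drained");
};
// node.port.postMessage("stop"); // flushes any queued audio immediately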
src/vad-processor.js ADDED
@@ -0,0 +1,37 @@
+ const MIN_CHUNK_SIZE = 512;
+ let globalPointer = 0;
+ let globalBuffer = new Float32Array(MIN_CHUNK_SIZE);
+
+ class VADProcessor extends AudioWorkletProcessor {
+   process(inputs, outputs, parameters) {
+     const buffer = inputs[0][0];
+     if (!buffer) return; // buffer is null when the stream ends
+
+     if (buffer.length > MIN_CHUNK_SIZE) {
+       // If the buffer is larger than the minimum chunk size, send the entire buffer
+       this.port.postMessage({ buffer });
+     } else {
+       const remaining = MIN_CHUNK_SIZE - globalPointer;
+       if (buffer.length >= remaining) {
+         // The buffer is larger than (or equal to) the remaining space in the
+         // global buffer, so copy just enough to fill it
+         globalBuffer.set(buffer.subarray(0, remaining), globalPointer);
+
+         // Send the global buffer
+         this.port.postMessage({ buffer: globalBuffer });
+
+         // Reset the global buffer and store the leftover samples
+         globalBuffer.fill(0);
+         globalBuffer.set(buffer.subarray(remaining), 0);
+         globalPointer = buffer.length - remaining;
+       } else {
+         // The buffer fits in the remaining space, so append it to the global buffer
+         globalBuffer.set(buffer, globalPointer);
+         globalPointer += buffer.length;
+       }
+     }
+
+     return true; // Keep the processor alive
+   }
+ }
+
+ registerProcessor("vad-processor", VADProcessor);
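
As a worked example of the accumulation above, assuming the standard 128-frame Web Audio render quantum:

// 128 frames per process() call (typical render quantum)
// 512 / 128 = 4 calls per posted chunk
// 512 / 16000 Hz = 0.032 s -> one VAD-sized chunk roughly every 32 ms of microphone audio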
src/worker.js ADDED
@@ -0,0 +1,355 @@
+ import {
+   // VAD
+   AutoModel,
+
+   // LLM
+   AutoTokenizer,
+   AutoModelForCausalLM,
+   TextStreamer,
+   InterruptableStoppingCriteria,
+
+   // Speech recognition
+   Tensor,
+   pipeline,
+ } from "@huggingface/transformers";
+
+ import { KokoroTTS, TextSplitterStream } from "kokoro-js";
+
+ import {
+   MAX_BUFFER_DURATION,
+   INPUT_SAMPLE_RATE,
+   SPEECH_THRESHOLD,
+   EXIT_THRESHOLD,
+   SPEECH_PAD_SAMPLES,
+   MAX_NUM_PREV_BUFFERS,
+   MIN_SILENCE_DURATION_SAMPLES,
+   MIN_SPEECH_DURATION_SAMPLES,
+ } from "./constants";
+
+ const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
+ let voice;
+ const tts = await KokoroTTS.from_pretrained(model_id, {
+   dtype: "fp32",
+   device: "webgpu",
+ });
+
+ const device = "webgpu";
+ self.postMessage({ type: "info", message: `Using device: "${device}"` });
+ self.postMessage({
+   type: "info",
+   message: "Loading models...",
+   duration: "until_next",
+ });
+
+ // Load models
+ const silero_vad = await AutoModel.from_pretrained(
+   "onnx-community/silero-vad",
+   {
+     config: { model_type: "custom" },
+     dtype: "fp32", // Full-precision
+   },
+ ).catch((error) => {
+   self.postMessage({ error });
+   throw error;
+ });
+
+ const DEVICE_DTYPE_CONFIGS = {
+   webgpu: {
+     encoder_model: "fp32",
+     decoder_model_merged: "fp32",
+   },
+   wasm: {
+     encoder_model: "fp32",
+     decoder_model_merged: "q8",
+   },
+ };
+ const transcriber = await pipeline(
+   "automatic-speech-recognition",
+   "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX"
+   {
+     device,
+     dtype: DEVICE_DTYPE_CONFIGS[device],
+   },
+ ).catch((error) => {
+   self.postMessage({ error });
+   throw error;
+ });
+
+ await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
+
+ const llm_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct";
+ const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
+ const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
+   dtype: "q4f16",
+   device: "webgpu",
+ });
+
+ const SYSTEM_MESSAGE = {
+   role: "system",
+   content:
+     "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
+ };
+ await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
+
+ let messages = [SYSTEM_MESSAGE];
+ let past_key_values_cache;
+ let stopping_criteria;
+ self.postMessage({
+   type: "status",
+   status: "ready",
+   message: "Ready!",
+   voices: tts.voices,
+ });
+
+ // Global audio buffer to store incoming audio
+ const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
+ let bufferPointer = 0;
+
+ // Initial state for VAD
+ const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
+ let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
+
+ // Whether we are in the process of adding audio to the buffer
+ let isRecording = false;
+ let isPlaying = false; // Whether a response is currently being played back
+
+ /**
+  * Perform Voice Activity Detection (VAD)
+  * @param {Float32Array} buffer The new audio buffer
+  * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
+  */
+ async function vad(buffer) {
+   const input = new Tensor("float32", buffer, [1, buffer.length]);
+
+   const { stateN, output } = await silero_vad({ input, sr, state });
+   state = stateN; // Update state
+
+   const isSpeech = output.data[0];
+
+   // Use heuristics to determine if the buffer is speech or not
+   return (
+     // Case 1: We are above the threshold (definitely speech)
+     isSpeech > SPEECH_THRESHOLD ||
+     // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
+     (isRecording && isSpeech >= EXIT_THRESHOLD)
+   );
+ }
+
+ /**
+  * Transcribe the audio buffer and generate a spoken response
+  * @param {Float32Array} buffer The audio buffer
+  * @param {Object} data Additional data
+  */
+ const speechToSpeech = async (buffer, data) => {
+   isPlaying = true;
+
+   // 1. Transcribe the audio from the user
+   const text = await transcriber(buffer).then(({ text }) => text.trim());
+   if (["", "[BLANK_AUDIO]"].includes(text)) {
+     // If the transcription is empty or blank audio, skip the rest of the processing
+     return;
+   }
+   messages.push({ role: "user", content: text });
+
+   // Set up text-to-speech streaming
+   const splitter = new TextSplitterStream();
+   const stream = tts.stream(splitter, {
+     voice,
+   });
+   (async () => {
+     for await (const { text, phonemes, audio } of stream) {
+       self.postMessage({ type: "output", text, result: audio });
+     }
+   })();
+
+   // 2. Generate a response using the LLM
+   const inputs = tokenizer.apply_chat_template(messages, {
+     add_generation_prompt: true,
+     return_dict: true,
+   });
+   const streamer = new TextStreamer(tokenizer, {
+     skip_prompt: true,
+     skip_special_tokens: true,
+     callback_function: (text) => {
+       splitter.push(text);
+     },
+     token_callback_function: () => {},
+   });
+
+   stopping_criteria = new InterruptableStoppingCriteria();
+   const { past_key_values, sequences } = await llm.generate({
+     ...inputs,
+     past_key_values: past_key_values_cache,
+
+     do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
+     max_new_tokens: 1024,
+     streamer,
+     stopping_criteria,
+     return_dict_in_generate: true,
+   });
+   past_key_values_cache = past_key_values;
+
+   // Finally, close the stream to signal that no more text will be added.
+   splitter.close();
+
+   const decoded = tokenizer.batch_decode(
+     sequences.slice(null, [inputs.input_ids.dims[1], null]),
+     { skip_special_tokens: true },
+   );
+
+   messages.push({ role: "assistant", content: decoded[0] });
+ };
+
+ // Track the number of samples after the last speech chunk
+ let postSpeechSamples = 0;
+ const resetAfterRecording = (offset = 0) => {
+   self.postMessage({
+     type: "status",
+     status: "recording_end",
+     message: "Transcribing...",
+     duration: "until_next",
+   });
+   BUFFER.fill(0, offset);
+   bufferPointer = offset;
+   isRecording = false;
+   postSpeechSamples = 0;
+ };
+
+ const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
+   // Get start and end time of the speech segment, minus the padding
+   const now = Date.now();
+   const end =
+     now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
+   const duration = end - start;
+   const overflowLength = overflow?.length ?? 0;
+
+   // Send the audio buffer to the speech-to-speech pipeline
+   const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
+
+   const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
+   const paddedBuffer = new Float32Array(prevLength + buffer.length);
+   let offset = 0;
+   for (const prev of prevBuffers) {
+     paddedBuffer.set(prev, offset);
+     offset += prev.length;
+   }
+   paddedBuffer.set(buffer, offset);
+   speechToSpeech(paddedBuffer, { start, end, duration });
+
+   // Set overflow (if present) and reset the rest of the audio buffer
+   if (overflow) {
+     BUFFER.set(overflow, 0);
+   }
+   resetAfterRecording(overflowLength);
+ };
+
+ let prevBuffers = [];
+ self.onmessage = async (event) => {
+   const { type, buffer } = event.data;
+
+   // Refuse new audio while playing back
+   if (type === "audio" && isPlaying) return;
+
+   switch (type) {
+     case "start_call": {
+       const name = tts.voices[voice ?? "af_heart"]?.name ?? "Heart";
+       greet(`Hey there, my name is ${name}! How can I help you today?`);
+       return;
+     }
+     case "end_call":
+       messages = [SYSTEM_MESSAGE];
+       past_key_values_cache = null;
+     // (fall through to interrupt any in-progress generation)
+     case "interrupt":
+       stopping_criteria?.interrupt();
+       return;
+     case "set_voice":
+       voice = event.data.voice;
+       return;
+     case "playback_ended":
+       isPlaying = false;
+       return;
+   }
+
+   const wasRecording = isRecording; // Save current state
+   const isSpeech = await vad(buffer);
+
+   if (!wasRecording && !isSpeech) {
+     // We are not recording, and the buffer is not speech,
+     // so we will probably discard the buffer. So, we insert
+     // it into a FIFO queue with a maximum size of MAX_NUM_PREV_BUFFERS
+     if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+       // If the queue is full, we discard the oldest buffer
+       prevBuffers.shift();
+     }
+     prevBuffers.push(buffer);
+     return;
+   }
+
+   const remaining = BUFFER.length - bufferPointer;
+   if (buffer.length >= remaining) {
+     // The buffer is larger than (or equal to) the remaining space in the global buffer,
+     // so we perform transcription and copy the overflow to the global buffer
+     BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
+     bufferPointer += remaining;
+
+     // Dispatch the audio buffer
+     const overflow = buffer.subarray(remaining);
+     dispatchForTranscriptionAndResetAudioBuffer(overflow);
+     return;
+   } else {
+     // The buffer is smaller than the remaining space in the global buffer,
+     // so we copy it to the global buffer
+     BUFFER.set(buffer, bufferPointer);
+     bufferPointer += buffer.length;
+   }
+
+   if (isSpeech) {
+     if (!isRecording) {
+       // Indicate start of recording
+       self.postMessage({
+         type: "status",
+         status: "recording_start",
+         message: "Listening...",
+         duration: "until_next",
+       });
+     }
+     // Start or continue recording
+     isRecording = true;
+     postSpeechSamples = 0; // Reset the post-speech samples
+     return;
+   }
+
+   postSpeechSamples += buffer.length;
+
+   // At this point we're confident that we were recording (wasRecording === true),
+   // but the latest buffer is not speech.
+   // So, we check whether we have reached the end of the current audio chunk.
+   if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+     // There was a short pause, but not long enough to consider the end of a speech chunk
+     // (e.g., the speaker took a breath), so we continue recording
+     return;
+   }
+
+   if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+     // The entire buffer (including the new chunk) is smaller than the minimum
+     // duration of a speech chunk, so we can safely discard the buffer.
+     resetAfterRecording();
+     return;
+   }
+
+   dispatchForTranscriptionAndResetAudioBuffer();
+ };
+
+ function greet(text) {
+   isPlaying = true;
+   const splitter = new TextSplitterStream();
+   const stream = tts.stream(splitter, { voice });
+   (async () => {
+     for await (const { text: chunkText, audio } of stream) {
+       self.postMessage({ type: "output", text: chunkText, result: audio });
+     }
+   })();
+   splitter.push(text);
+   splitter.close();
+   messages.push({ role: "assistant", content: text });
+ }
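
For reference, a minimal sketch (with hypothetical variable names) of the main-thread side of the message protocol implemented above; the full version lives in src/App.jsx:

const worker = new Worker(new URL("./worker.js", import.meta.url), {
  type: "module",
});

worker.addEventListener("message", ({ data }) => {
  if (data.error) return console.error(data.error);
  if (data.type === "status" && data.status === "ready") {
    // data.voices lists the available Kokoro voices
    worker.postMessage({ type: "set_voice", voice: "af_heart" });
    worker.postMessage({ type: "start_call" });
  } else if (data.type === "output") {
    // data.result.audio is a Float32Array at 24 kHz, ready for the playback worklet
  }
});

// Stream 512-sample microphone chunks (as produced by the VAD worklet):
worker.postMessage({ type: "audio", buffer: new Float32Array(512) });

// Notify the worker when the playback queue drains, and when the user hangs up:
worker.postMessage({ type: "playback_ended" });
worker.postMessage({ type: "end_call" });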
vite.config.js ADDED
@@ -0,0 +1,19 @@
+ import { defineConfig } from "vite";
+ import react from "@vitejs/plugin-react";
+ import tailwindcss from "@tailwindcss/vite";
+
+ // https://vite.dev/config/
+ export default defineConfig({
+   plugins: [tailwindcss(), react()],
+   build: {
+     target: "esnext",
+   },
+   worker: {
+     format: "es",
+   },
+   resolve: {
+     // Only bundle a single instance of Transformers.js
+     // (shared by `@huggingface/transformers` and `kokoro-js`)
+     dedupe: ["@huggingface/transformers"],
+   },
+ });