Xenova (HF Staff) committed
Commit 022e8a2 · verified · 1 Parent(s): 74cbdd4
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+public/logo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,81 +1,10 @@
 ---
-title: Speech To Speech Webgpu
-emoji: 🐠
+title: Speech To Speech WebGPU
+emoji: 🗣️
 colorFrom: indigo
 colorTo: red
 sdk: static
 pinned: false
 app_build_command: npm run build
-app_file: build/index.html
+app_file: dist/index.html
 ---
-
-# Getting Started with Create React App
-
-This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app).
-
-## Available Scripts
-
-In the project directory, you can run:
-
-### `npm start`
-
-Runs the app in the development mode.\
-Open [http://localhost:3000](http://localhost:3000) to view it in your browser.
-
-The page will reload when you make changes.\
-You may also see any lint errors in the console.
-
-### `npm test`
-
-Launches the test runner in the interactive watch mode.\
-See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information.
-
-### `npm run build`
-
-Builds the app for production to the `build` folder.\
-It correctly bundles React in production mode and optimizes the build for the best performance.
-
-The build is minified and the filenames include the hashes.\
-Your app is ready to be deployed!
-
-See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information.
-
-### `npm run eject`
-
-**Note: this is a one-way operation. Once you `eject`, you can't go back!**
-
-If you aren't satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project.
-
-Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you're on your own.
-
-You don't have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn't feel obligated to use this feature. However we understand that this tool wouldn't be useful if you couldn't customize it when you are ready for it.
-
-## Learn More
-
-You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
-
-To learn React, check out the [React documentation](https://reactjs.org/).
-
-### Code Splitting
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/code-splitting](https://facebook.github.io/create-react-app/docs/code-splitting)
-
-### Analyzing the Bundle Size
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size](https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size)
-
-### Making a Progressive Web App
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app](https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app)
-
-### Advanced Configuration
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/advanced-configuration](https://facebook.github.io/create-react-app/docs/advanced-configuration)
-
-### Deployment
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/deployment](https://facebook.github.io/create-react-app/docs/deployment)
-
-### `npm run build` fails to minify
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify](https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify)
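The front matter above is the Hugging Face Spaces configuration for a static Space: the Space runs `app_build_command` (`npm run build`) and serves the resulting `app_file`. The switch from `build/index.html` to `dist/index.html` follows from the Create React App → Vite migration in this commit, since Vite emits its production build to `dist/` by default.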
eslint.config.js ADDED
@@ -0,0 +1,33 @@
+import js from "@eslint/js";
+import globals from "globals";
+import reactHooks from "eslint-plugin-react-hooks";
+import reactRefresh from "eslint-plugin-react-refresh";
+
+export default [
+  { ignores: ["dist"] },
+  {
+    files: ["**/*.{js,jsx}"],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+      parserOptions: {
+        ecmaVersion: "latest",
+        ecmaFeatures: { jsx: true },
+        sourceType: "module",
+      },
+    },
+    plugins: {
+      "react-hooks": reactHooks,
+      "react-refresh": reactRefresh,
+    },
+    rules: {
+      ...js.configs.recommended.rules,
+      ...reactHooks.configs.recommended.rules,
+      "no-unused-vars": ["error", { varsIgnorePattern: "^[A-Z_]" }],
+      "react-refresh/only-export-components": [
+        "warn",
+        { allowConstantExport: true },
+      ],
+    },
+  },
+];
index.html ADDED
@@ -0,0 +1,13 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/png" href="/logo.png" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Transformers.js | Speech-to-speech demo</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.jsx"></script>
+  </body>
+</html>
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json CHANGED
@@ -1,39 +1,32 @@
 {
-  "name": "react-template",
-  "version": "0.1.0",
+  "name": "speech-to-speech",
   "private": true,
-  "dependencies": {
-    "@testing-library/dom": "^10.4.0",
-    "@testing-library/jest-dom": "^6.6.3",
-    "@testing-library/react": "^16.3.0",
-    "@testing-library/user-event": "^13.5.0",
-    "react": "^19.1.0",
-    "react-dom": "^19.1.0",
-    "react-scripts": "5.0.1",
-    "web-vitals": "^2.1.4"
-  },
+  "version": "0.0.0",
+  "type": "module",
   "scripts": {
-    "start": "react-scripts start",
-    "build": "react-scripts build",
-    "test": "react-scripts test",
-    "eject": "react-scripts eject"
+    "dev": "vite",
+    "build": "vite build",
+    "lint": "eslint .",
+    "preview": "vite preview"
   },
-  "eslintConfig": {
-    "extends": [
-      "react-app",
-      "react-app/jest"
-    ]
+  "dependencies": {
+    "@huggingface/transformers": "^3.5.2",
+    "@tailwindcss/vite": "^4.1.4",
+    "kokoro-js": "^1.2.1",
+    "lucide-react": "^0.503.0",
+    "react": "^19.0.0",
+    "react-dom": "^19.0.0",
+    "tailwindcss": "^4.1.4"
   },
-  "browserslist": {
-    "production": [
-      ">0.2%",
-      "not dead",
-      "not op_mini all"
-    ],
-    "development": [
-      "last 1 chrome version",
-      "last 1 firefox version",
-      "last 1 safari version"
-    ]
-  }
+  "devDependencies": {
+    "@eslint/js": "^9.22.0",
+    "@types/react": "^19.0.10",
+    "@types/react-dom": "^19.0.4",
+    "@vitejs/plugin-react": "^4.3.4",
+    "eslint": "^9.22.0",
+    "eslint-plugin-react-hooks": "^5.2.0",
+    "eslint-plugin-react-refresh": "^0.4.19",
+    "globals": "^16.0.0",
+    "vite": "^6.3.1"
+  }
 }
public/favicon.ico DELETED
Binary file (3.87 kB)
 
public/index.html DELETED
@@ -1,43 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <meta name="theme-color" content="#000000" />
-    <meta
-      name="description"
-      content="Web site created using create-react-app"
-    />
-    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
-    <!--
-      manifest.json provides metadata used when your web app is installed on a
-      user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
-    -->
-    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
-    <!--
-      Notice the use of %PUBLIC_URL% in the tags above.
-      It will be replaced with the URL of the `public` folder during the build.
-      Only files inside the `public` folder can be referenced from the HTML.
-
-      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
-      work correctly both with client-side routing and a non-root public URL.
-      Learn how to configure a non-root public URL by running `npm run build`.
-    -->
-    <title>React App</title>
-  </head>
-  <body>
-    <noscript>You need to enable JavaScript to run this app.</noscript>
-    <div id="root"></div>
-    <!--
-      This HTML file is a template.
-      If you open it directly in the browser, you will see an empty page.
-
-      You can add webfonts, meta tags, or analytics to this file.
-      The build step will place the bundled scripts into the <body> tag.
-
-      To begin the development, run `npm start` or `yarn start`.
-      To create a production bundle, use `npm run build` or `yarn build`.
-    -->
-  </body>
-</html>
public/logo.png ADDED
Git LFS Details
  • SHA256: 9fb2bd90d1eeab88414681bb80464bc723ab57fb2c5b2f33367f16a0157ed5c0
  • Pointer size: 131 Bytes
  • Size of remote file: 634 kB
public/logo192.png DELETED
Binary file (5.35 kB)
 
public/logo512.png DELETED
Binary file (9.66 kB)
 
public/manifest.json DELETED
@@ -1,25 +0,0 @@
-{
-  "short_name": "React App",
-  "name": "Create React App Sample",
-  "icons": [
-    {
-      "src": "favicon.ico",
-      "sizes": "64x64 32x32 24x24 16x16",
-      "type": "image/x-icon"
-    },
-    {
-      "src": "logo192.png",
-      "type": "image/png",
-      "sizes": "192x192"
-    },
-    {
-      "src": "logo512.png",
-      "type": "image/png",
-      "sizes": "512x512"
-    }
-  ],
-  "start_url": ".",
-  "display": "standalone",
-  "theme_color": "#000000",
-  "background_color": "#ffffff"
-}
public/robots.txt DELETED
@@ -1,3 +0,0 @@
-# https://www.robotstxt.org/robotstxt.html
-User-agent: *
-Disallow:
src/App.css DELETED
@@ -1,38 +0,0 @@
-.App {
-  text-align: center;
-}
-
-.App-logo {
-  height: 40vmin;
-  pointer-events: none;
-}
-
-@media (prefers-reduced-motion: no-preference) {
-  .App-logo {
-    animation: App-logo-spin infinite 20s linear;
-  }
-}
-
-.App-header {
-  background-color: #282c34;
-  min-height: 100vh;
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  font-size: calc(10px + 2vmin);
-  color: white;
-}
-
-.App-link {
-  color: #61dafb;
-}
-
-@keyframes App-logo-spin {
-  from {
-    transform: rotate(0deg);
-  }
-  to {
-    transform: rotate(360deg);
-  }
-}
src/App.js DELETED
@@ -1,25 +0,0 @@
-import logo from './logo.svg';
-import './App.css';
-
-function App() {
-  return (
-    <div className="App">
-      <header className="App-header">
-        <img src={logo} className="App-logo" alt="logo" />
-        <p>
-          Edit <code>src/App.js</code> and save to reload.
-        </p>
-        <a
-          className="App-link"
-          href="https://reactjs.org"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          Learn React
-        </a>
-      </header>
-    </div>
-  );
-}
-
-export default App;
src/App.jsx ADDED
@@ -0,0 +1,367 @@
+import { useEffect, useState, useRef } from "react";
+import { Mic, PhoneOff, ChevronDown } from "lucide-react";
+import { INPUT_SAMPLE_RATE } from "./constants";
+
+import WORKLET from "./play-worklet.js";
+
+export default function App() {
+  const [callStartTime, setCallStartTime] = useState(null);
+  const [callStarted, setCallStarted] = useState(false);
+  const [playing, setPlaying] = useState(false);
+
+  const [voice, setVoice] = useState("af_heart");
+  const [voices, setVoices] = useState([]);
+
+  const [isListening, setIsListening] = useState(false);
+  const [isSpeaking, setIsSpeaking] = useState(false);
+  const [listeningScale, setListeningScale] = useState(1);
+  const [speakingScale, setSpeakingScale] = useState(1);
+  const [ripples, setRipples] = useState([]);
+
+  const [ready, setReady] = useState(false);
+  const [error, setError] = useState(null);
+  const [elapsedTime, setElapsedTime] = useState("00:00");
+  const worker = useRef(null);
+
+  const node = useRef(null);
+
+  useEffect(() => {
+    worker.current?.postMessage({
+      type: "set_voice",
+      voice,
+    });
+  }, [voice]);
+
+  useEffect(() => {
+    if (!callStarted) {
+      // Reset worker state after call ends
+      worker.current?.postMessage({
+        type: "end_call",
+      });
+    }
+  }, [callStarted]);
+
+  useEffect(() => {
+    if (callStarted && callStartTime) {
+      const interval = setInterval(() => {
+        const diff = Math.floor((Date.now() - callStartTime) / 1000);
+        const minutes = String(Math.floor(diff / 60)).padStart(2, "0");
+        const seconds = String(diff % 60).padStart(2, "0");
+        setElapsedTime(`${minutes}:${seconds}`);
+      }, 1000);
+      return () => clearInterval(interval);
+    } else {
+      setElapsedTime("00:00");
+    }
+  }, [callStarted, callStartTime]);
+
+  useEffect(() => {
+    worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
+      type: "module",
+    });
+
+    const onMessage = ({ data }) => {
+      console.log("Worker message:", data);
+      if (data.error) {
+        return onError(data.error);
+      }
+
+      switch (data.type) {
+        case "status":
+          if (data.status === "recording_start") {
+            setIsListening(true);
+            setIsSpeaking(false);
+          } else if (data.status === "recording_end") {
+            setIsListening(false);
+          } else if (data.status === "ready") {
+            setVoices(data.voices);
+            setReady(true);
+          }
+          break;
+        case "output":
+          if (!playing) {
+            node.current?.port.postMessage(data.result.audio);
+            setPlaying(true);
+            setIsSpeaking(true);
+            setIsListening(false);
+          }
+          break;
+      }
+    };
+    const onError = (err) => setError(err.message);
+
+    worker.current.addEventListener("message", onMessage);
+    worker.current.addEventListener("error", onError);
+
+    return () => {
+      worker.current.removeEventListener("message", onMessage);
+      worker.current.removeEventListener("error", onError);
+    };
+  }, []);
+
+  useEffect(() => {
+    if (!callStarted) return;
+
+    let worklet;
+    let inputAudioContext;
+    let source;
+    let ignore = false;
+
+    let outputAudioContext;
+    const audioStreamPromise = navigator.mediaDevices.getUserMedia({
+      audio: {
+        channelCount: 1,
+        echoCancellation: true,
+        autoGainControl: true,
+        noiseSuppression: true,
+        sampleRate: INPUT_SAMPLE_RATE,
+      },
+    });
+
+    audioStreamPromise
+      .then(async (stream) => {
+        if (ignore) return;
+
+        inputAudioContext = new (window.AudioContext ||
+          window.webkitAudioContext)({
+          sampleRate: INPUT_SAMPLE_RATE,
+        });
+
+        const analyser = inputAudioContext.createAnalyser();
+        analyser.fftSize = 256;
+        source = inputAudioContext.createMediaStreamSource(stream);
+        source.connect(analyser);
+
+        const inputDataArray = new Uint8Array(analyser.frequencyBinCount);
+
+        function calculateRMS(array) {
+          let sum = 0;
+          for (let i = 0; i < array.length; ++i) {
+            const normalized = array[i] / 128 - 1;
+            sum += normalized * normalized;
+          }
+          const rms = Math.sqrt(sum / array.length);
+          return rms;
+        }
+
+        await inputAudioContext.audioWorklet.addModule(
+          new URL("./vad-processor.js", import.meta.url),
+        );
+        worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
+          numberOfInputs: 1,
+          numberOfOutputs: 0,
+          channelCount: 1,
+          channelCountMode: "explicit",
+          channelInterpretation: "discrete",
+        });
+
+        source.connect(worklet);
+        worklet.port.onmessage = (event) => {
+          const { buffer } = event.data;
+          worker.current?.postMessage({ type: "audio", buffer });
+        };
+
+        outputAudioContext = new AudioContext({
+          sampleRate: 24000,
+        });
+        outputAudioContext.resume();
+
+        const blob = new Blob([`(${WORKLET.toString()})()`], {
+          type: "application/javascript",
+        });
+        const url = URL.createObjectURL(blob);
+        await outputAudioContext.audioWorklet.addModule(url);
+        URL.revokeObjectURL(url);
+
+        node.current = new AudioWorkletNode(
+          outputAudioContext,
+          "buffered-audio-worklet-processor",
+        );
+
+        node.current.port.onmessage = (event) => {
+          if (event.data.type === "playback_ended") {
+            setPlaying(false);
+            setIsSpeaking(false);
+            worker.current?.postMessage({ type: "playback_ended" });
+          }
+        };
+
+        const outputAnalyser = outputAudioContext.createAnalyser();
+        outputAnalyser.fftSize = 256;
+
+        node.current.connect(outputAnalyser);
+        outputAnalyser.connect(outputAudioContext.destination);
+
+        const outputDataArray = new Uint8Array(
+          outputAnalyser.frequencyBinCount,
+        );
+
+        function updateVisualizers() {
+          analyser.getByteTimeDomainData(inputDataArray);
+          const rms = calculateRMS(inputDataArray);
+          const targetScale = 1 + Math.min(1.25 * rms, 0.25);
+          setListeningScale((prev) => prev + (targetScale - prev) * 0.25);
+
+          outputAnalyser.getByteTimeDomainData(outputDataArray);
+          const outputRMS = calculateRMS(outputDataArray);
+          const targetOutputScale = 1 + Math.min(1.25 * outputRMS, 0.25);
+          setSpeakingScale((prev) => prev + (targetOutputScale - prev) * 0.25);
+
+          requestAnimationFrame(updateVisualizers);
+        }
+        updateVisualizers();
+      })
+      .catch((err) => {
+        setError(err.message);
+        console.error(err);
+      });
+
+    return () => {
+      ignore = true;
+
+      audioStreamPromise.then((stream) =>
+        stream.getTracks().forEach((track) => track.stop()),
+      );
+      source?.disconnect();
+      worklet?.disconnect();
+      inputAudioContext?.close();
+
+      outputAudioContext?.close();
+    };
+  }, [callStarted]);
+
+  useEffect(() => {
+    if (!callStarted) return;
+    const interval = setInterval(() => {
+      const id = Date.now();
+      setRipples((prev) => [...prev, id]);
+      setTimeout(() => {
+        setRipples((prev) => prev.filter((r) => r !== id));
+      }, 1500);
+    }, 1000);
+    return () => clearInterval(interval);
+  }, [callStarted]);
+
+  return (
+    <div className="h-screen min-h-[240px] flex items-center justify-center bg-gray-50 p-4 relative">
+      <div className="h-full max-h-[320px] w-[640px] bg-white rounded-xl shadow-lg p-8 flex items-center justify-between space-x-16">
+        <div className="text-green-700 w-[140px]">
+          <div className="text-xl font-bold flex justify-between">
+            {voices?.[voice]?.name}
+            <span className="font-normal text-gray-500">{elapsedTime}</span>
+          </div>
+          <div className="text-base relative">
+            <button
+              type="button"
+              disabled={!ready}
+              className={`w-full flex items-center justify-between border border-gray-300 rounded-md transition-colors ${
+                ready
+                  ? "bg-transparent hover:border-gray-400"
+                  : "bg-gray-100 opacity-50 cursor-not-allowed"
+              }`}
+            >
+              <span className="px-2 py-1">Select voice</span>
+              <ChevronDown className="absolute right-2" />
+            </button>
+            <select
+              value={voice}
+              onChange={(e) => setVoice(e.target.value)}
+              className="absolute inset-0 opacity-0 cursor-pointer"
+              disabled={!ready}
+            >
+              {Object.entries(voices).map(([key, v]) => (
+                <option key={key} value={key}>
+                  {`${v.name} (${
+                    v.language === "en-us" ? "American" : v.language
+                  } ${v.gender})`}
+                </option>
+              ))}
+            </select>
+          </div>
+        </div>
+
+        <div className="relative flex items-center justify-center w-32 h-32 flex-shrink-0 aspect-square">
+          {callStarted &&
+            ripples.map((id) => (
+              <div
+                key={id}
+                className="absolute inset-0 rounded-full border-2 border-green-200 pointer-events-none"
+                style={{ animation: "ripple 1.5s ease-out forwards" }}
+              />
+            ))}
+          <div className="absolute z-10 text-lg text-gray-700">
+            {!ready ? "Loading..." : ""}
+            {isListening && "Listening..."}
+            {isSpeaking && "Speaking..."}
+          </div>
+          {/* Pulsing loader while initializing */}
+          <div
+            className={`absolute w-32 h-32 rounded-full bg-green-200 ${
+              !ready ? "animate-ping opacity-75" : ""
+            }`}
+            style={{ animationDuration: "1.5s" }}
+          />
+          {/* Main rings */}
+          <div
+            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-300 ${
+              !ready ? "opacity-0" : ""
+            }`}
+            style={{ transform: `scale(${speakingScale})` }}
+          />
+          <div
+            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-200 ${
+              !ready ? "opacity-0" : ""
+            }`}
+            style={{ transform: `scale(${listeningScale})` }}
+          />
+        </div>
+
+        <div className="space-y-4 w-[140px]">
+          {callStarted ? (
+            <button
+              className="flex items-center space-x-2 px-4 py-2 bg-red-100 text-red-700 rounded-md hover:bg-red-200"
+              onClick={() => {
+                setCallStarted(false);
+                setCallStartTime(null);
+                setPlaying(false);
+                setIsListening(false);
+                setIsSpeaking(false);
+              }}
+            >
+              <PhoneOff className="w-5 h-5" />
+              <span>End call</span>
+            </button>
+          ) : (
+            <button
+              className={`flex items-center space-x-2 px-4 py-2 rounded-md ${
+                ready
+                  ? "bg-blue-100 text-blue-700 hover:bg-blue-200"
+                  : "bg-blue-100 text-blue-700 opacity-50 cursor-not-allowed"
+              }`}
+              onClick={() => {
+                setCallStartTime(Date.now());
+                setCallStarted(true);
+                worker.current?.postMessage({ type: "start_call" });
+              }}
+              disabled={!ready}
+            >
+              <span>Start call</span>
+            </button>
+          )}
+        </div>
+      </div>
+
+      <div className="absolute bottom-4 text-sm">
+        Built with{" "}
+        <a
+          href="https://github.com/huggingface/transformers.js"
+          rel="noopener noreferrer"
+          target="_blank"
+          className="text-blue-600 hover:underline"
+        >
+          🤗 Transformers.js
+        </a>
+      </div>
+    </div>
+  );
+}
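A note on the audio wiring above: the app runs two separate AudioContexts, a 16 kHz input context (matching `INPUT_SAMPLE_RATE`) whose `vad-processor` worklet forwards 512-sample chunks to the worker, and a 24 kHz output context (the TTS output rate) running the buffered playback worklet. The playback worklet in `src/play-worklet.js` is a plain default-exported function, so it is loaded by stringifying it into a Blob URL rather than shipping a separate worklet asset.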
src/App.test.js DELETED
@@ -1,8 +0,0 @@
-import { render, screen } from '@testing-library/react';
-import App from './App';
-
-test('renders learn react link', () => {
-  render(<App />);
-  const linkElement = screen.getByText(/learn react/i);
-  expect(linkElement).toBeInTheDocument();
-});
src/constants.js ADDED
@@ -0,0 +1,53 @@
+/**
+ * Sample rate of the input audio.
+ * Coincidentally, this is the same for both models (Moonshine and Silero VAD)
+ */
+export const INPUT_SAMPLE_RATE = 16000;
+const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;
+
+/**
+ * Probabilities ABOVE this value are considered as SPEECH
+ */
+export const SPEECH_THRESHOLD = 0.3;
+
+/**
+ * If the current state is SPEECH, and the probability of the next state
+ * is below this value, it is considered as NON-SPEECH.
+ */
+export const EXIT_THRESHOLD = 0.1;
+
+/**
+ * After each speech chunk, wait for at least this amount of silence
+ * before considering the next chunk as a new speech chunk
+ */
+export const MIN_SILENCE_DURATION_MS = 400;
+export const MIN_SILENCE_DURATION_SAMPLES =
+  MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;
+
+/**
+ * Pad the speech chunk with this amount on each side
+ */
+export const SPEECH_PAD_MS = 80;
+export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;
+
+/**
+ * Final speech chunks below this duration are discarded
+ */
+export const MIN_SPEECH_DURATION_SAMPLES = 250 * INPUT_SAMPLE_RATE_MS; // 250 ms
+
+/**
+ * Maximum duration of audio that can be handled by Moonshine
+ */
+export const MAX_BUFFER_DURATION = 30;
+
+/**
+ * Size of the incoming buffers
+ */
+export const NEW_BUFFER_SIZE = 512;
+
+/**
+ * The number of previous buffers to keep, to ensure the audio is padded correctly
+ */
+export const MAX_NUM_PREV_BUFFERS = Math.ceil(
+  SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
+);
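For concreteness, with `INPUT_SAMPLE_RATE_MS = 16` samples per millisecond these constants work out to: `MIN_SILENCE_DURATION_SAMPLES = 400 × 16 = 6400`, `SPEECH_PAD_SAMPLES = 80 × 16 = 1280`, `MIN_SPEECH_DURATION_SAMPLES = 250 × 16 = 4000`, and `MAX_NUM_PREV_BUFFERS = ⌈1280 / 512⌉ = 3`, i.e. three previous 512-sample buffers are retained so a detected speech chunk can be left-padded with 80 ms of leading audio.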
src/index.css CHANGED
@@ -1,13 +1,12 @@
-body {
-  margin: 0;
-  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
-    'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
-    sans-serif;
-  -webkit-font-smoothing: antialiased;
-  -moz-osx-font-smoothing: grayscale;
-}
+@import "tailwindcss";
 
-code {
-  font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
-    monospace;
+@keyframes ripple {
+  from {
+    transform: scale(1);
+    opacity: 0.7;
+  }
+  to {
+    transform: scale(2);
+    opacity: 0;
+  }
 }
src/index.js DELETED
@@ -1,17 +0,0 @@
-import React from 'react';
-import ReactDOM from 'react-dom/client';
-import './index.css';
-import App from './App';
-import reportWebVitals from './reportWebVitals';
-
-const root = ReactDOM.createRoot(document.getElementById('root'));
-root.render(
-  <React.StrictMode>
-    <App />
-  </React.StrictMode>
-);
-
-// If you want to start measuring performance in your app, pass a function
-// to log results (for example: reportWebVitals(console.log))
-// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
-reportWebVitals();
src/logo.svg DELETED
src/main.jsx ADDED
@@ -0,0 +1,10 @@
+import { StrictMode } from "react";
+import { createRoot } from "react-dom/client";
+import "./index.css";
+import App from "./App.jsx";
+
+createRoot(document.getElementById("root")).render(
+  <StrictMode>
+    <App />
+  </StrictMode>,
+);
src/play-worklet.js ADDED
@@ -0,0 +1,73 @@
+export default () => {
+  class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
+    constructor() {
+      super();
+      this.bufferQueue = [];
+      this.currentChunkOffset = 0;
+      this.hadData = false;
+
+      this.port.onmessage = (event) => {
+        const data = event.data;
+        if (data instanceof Float32Array) {
+          this.hadData = true;
+          this.bufferQueue.push(data);
+        } else if (data === "stop") {
+          this.bufferQueue = [];
+          this.currentChunkOffset = 0;
+        }
+      };
+    }
+
+    process(inputs, outputs) {
+      const channel = outputs[0][0];
+      if (!channel) return true;
+
+      const numSamples = channel.length;
+      let outputIndex = 0;
+
+      if (this.hadData && this.bufferQueue.length === 0) {
+        this.port.postMessage({ type: "playback_ended" });
+        this.hadData = false;
+      }
+
+      while (outputIndex < numSamples) {
+        if (this.bufferQueue.length > 0) {
+          const currentChunk = this.bufferQueue[0];
+          const remainingSamples = currentChunk.length - this.currentChunkOffset;
+          const samplesToCopy = Math.min(
+            remainingSamples,
+            numSamples - outputIndex,
+          );
+
+          channel.set(
+            currentChunk.subarray(
+              this.currentChunkOffset,
+              this.currentChunkOffset + samplesToCopy,
+            ),
+            outputIndex,
+          );
+
+          this.currentChunkOffset += samplesToCopy;
+          outputIndex += samplesToCopy;
+
+          // Remove the chunk if fully consumed.
+          if (this.currentChunkOffset >= currentChunk.length) {
+            this.bufferQueue.shift();
+            this.currentChunkOffset = 0;
+          }
+        } else {
+          // If no data is available, fill the rest of the buffer with silence.
+          channel.fill(0, outputIndex);
+          outputIndex = numSamples;
+        }
+      }
+      return true;
+    }
+  }
+
+  registerProcessor(
+    "buffered-audio-worklet-processor",
+    BufferedAudioWorkletProcessor,
+  );
+};
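Because the processor is wrapped in a default-exported function rather than a standalone worklet module, the main thread loads it by stringifying the function into a Blob URL. A minimal usage sketch, mirroring the wiring in `src/App.jsx` (top-level `await` assumed):

```js
import WORKLET from "./play-worklet.js";

const ctx = new AudioContext({ sampleRate: 24000 });

// Serialize the exported function into an immediately-invoked module,
// which calls registerProcessor() inside the worklet scope.
const blob = new Blob([`(${WORKLET.toString()})()`], {
  type: "application/javascript",
});
const url = URL.createObjectURL(blob);
await ctx.audioWorklet.addModule(url);
URL.revokeObjectURL(url);

const node = new AudioWorkletNode(ctx, "buffered-audio-worklet-processor");
node.connect(ctx.destination);

node.port.postMessage(new Float32Array(24000)); // enqueue 1 s of silence
node.port.onmessage = (e) => {
  if (e.data.type === "playback_ended") console.log("queue drained");
};
```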
src/reportWebVitals.js DELETED
@@ -1,13 +0,0 @@
-const reportWebVitals = onPerfEntry => {
-  if (onPerfEntry && onPerfEntry instanceof Function) {
-    import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
-      getCLS(onPerfEntry);
-      getFID(onPerfEntry);
-      getFCP(onPerfEntry);
-      getLCP(onPerfEntry);
-      getTTFB(onPerfEntry);
-    });
-  }
-};
-
-export default reportWebVitals;
src/setupTests.js DELETED
@@ -1,5 +0,0 @@
-// jest-dom adds custom jest matchers for asserting on DOM nodes.
-// allows you to do things like:
-// expect(element).toHaveTextContent(/react/i)
-// learn more: https://github.com/testing-library/jest-dom
-import '@testing-library/jest-dom';
src/vad-processor.js ADDED
@@ -0,0 +1,37 @@
+const MIN_CHUNK_SIZE = 512;
+let globalPointer = 0;
+let globalBuffer = new Float32Array(MIN_CHUNK_SIZE);
+
+class VADProcessor extends AudioWorkletProcessor {
+  process(inputs, outputs, parameters) {
+    const buffer = inputs[0][0];
+    if (!buffer) return; // buffer is null when the stream ends
+
+    if (buffer.length > MIN_CHUNK_SIZE) {
+      // If the buffer is larger than the minimum chunk size, send the entire buffer
+      this.port.postMessage({ buffer });
+    } else {
+      const remaining = MIN_CHUNK_SIZE - globalPointer;
+      if (buffer.length >= remaining) {
+        // If the buffer is larger than (or equal to) the remaining space in the
+        // global buffer, copy the remaining space
+        globalBuffer.set(buffer.subarray(0, remaining), globalPointer);
+
+        // Send the global buffer
+        this.port.postMessage({ buffer: globalBuffer });
+
+        // Reset the global buffer and set the remaining buffer
+        globalBuffer.fill(0);
+        globalBuffer.set(buffer.subarray(remaining), 0);
+        globalPointer = buffer.length - remaining;
+      } else {
+        // If the buffer is smaller than the remaining space in the global
+        // buffer, copy the buffer to the global buffer
+        globalBuffer.set(buffer, globalPointer);
+        globalPointer += buffer.length;
+      }
+    }
+
+    return true; // Keep the processor alive
+  }
+}
+
+registerProcessor("vad-processor", VADProcessor);
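Worth noting: an AudioWorklet's `process()` receives 128-sample render quanta (per the Web Audio spec), so the `buffer.length > MIN_CHUNK_SIZE` branch is essentially defensive; in practice the processor accumulates four 128-sample blocks into `globalBuffer` and posts one 512-sample chunk at a time, matching the `NEW_BUFFER_SIZE` that the worker-side constants assume.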
src/worker.js ADDED
@@ -0,0 +1,355 @@
+import {
+  // VAD
+  AutoModel,
+
+  // LLM
+  AutoTokenizer,
+  AutoModelForCausalLM,
+  TextStreamer,
+  InterruptableStoppingCriteria,
+
+  // Speech recognition
+  Tensor,
+  pipeline,
+} from "@huggingface/transformers";
+
+import { KokoroTTS, TextSplitterStream } from "kokoro-js";
+
+import {
+  MAX_BUFFER_DURATION,
+  INPUT_SAMPLE_RATE,
+  SPEECH_THRESHOLD,
+  EXIT_THRESHOLD,
+  SPEECH_PAD_SAMPLES,
+  MAX_NUM_PREV_BUFFERS,
+  MIN_SILENCE_DURATION_SAMPLES,
+  MIN_SPEECH_DURATION_SAMPLES,
+} from "./constants";
+
+const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
+let voice;
+const tts = await KokoroTTS.from_pretrained(model_id, {
+  dtype: "fp32",
+  device: "webgpu",
+});
+
+const device = "webgpu";
+self.postMessage({ type: "info", message: `Using device: "${device}"` });
+self.postMessage({
+  type: "info",
+  message: "Loading models...",
+  duration: "until_next",
+});
+
+// Load models
+const silero_vad = await AutoModel.from_pretrained(
+  "onnx-community/silero-vad",
+  {
+    config: { model_type: "custom" },
+    dtype: "fp32", // Full-precision
+  },
+).catch((error) => {
+  self.postMessage({ error });
+  throw error;
+});
+
+const DEVICE_DTYPE_CONFIGS = {
+  webgpu: {
+    encoder_model: "fp32",
+    decoder_model_merged: "fp32",
+  },
+  wasm: {
+    encoder_model: "fp32",
+    decoder_model_merged: "q8",
+  },
+};
+const transcriber = await pipeline(
+  "automatic-speech-recognition",
+  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
+  {
+    device,
+    dtype: DEVICE_DTYPE_CONFIGS[device],
+  },
+).catch((error) => {
+  self.postMessage({ error });
+  throw error;
+});
+
+await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
+
+const llm_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct";
+const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
+const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
+  dtype: "q4f16",
+  device: "webgpu",
+});
+
+const SYSTEM_MESSAGE = {
+  role: "system",
+  content:
+    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
+};
+await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
+
+let messages = [SYSTEM_MESSAGE];
+let past_key_values_cache;
+let stopping_criteria;
+self.postMessage({
+  type: "status",
+  status: "ready",
+  message: "Ready!",
+  voices: tts.voices,
+});
+
+// Global audio buffer to store incoming audio
+const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
+let bufferPointer = 0;
+
+// Initial state for VAD
+const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
+let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
+
+// Whether we are in the process of adding audio to the buffer
+let isRecording = false;
+let isPlaying = false; // new flag
+
+/**
+ * Perform Voice Activity Detection (VAD)
+ * @param {Float32Array} buffer The new audio buffer
+ * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
+ */
+async function vad(buffer) {
+  const input = new Tensor("float32", buffer, [1, buffer.length]);
+
+  const { stateN, output } = await silero_vad({ input, sr, state });
+  state = stateN; // Update state
+
+  const isSpeech = output.data[0];
+
+  // Use heuristics to determine if the buffer is speech or not
+  return (
+    // Case 1: We are above the threshold (definitely speech)
+    isSpeech > SPEECH_THRESHOLD ||
+    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
+    (isRecording && isSpeech >= EXIT_THRESHOLD)
+  );
+}
+
+/**
+ * Transcribe the audio buffer
+ * @param {Float32Array} buffer The audio buffer
+ * @param {Object} data Additional data
+ */
+const speechToSpeech = async (buffer, data) => {
+  isPlaying = true;
+
+  // 1. Transcribe the audio from the user
+  const text = await transcriber(buffer).then(({ text }) => text.trim());
+  if (["", "[BLANK_AUDIO]"].includes(text)) {
+    // If the transcription is empty or a blank audio, we skip the rest of the processing
+    return;
+  }
+  messages.push({ role: "user", content: text });
+
+  // Set up text-to-speech streaming
+  const splitter = new TextSplitterStream();
+  const stream = tts.stream(splitter, {
+    voice,
+  });
+  (async () => {
+    for await (const { text, phonemes, audio } of stream) {
+      self.postMessage({ type: "output", text, result: audio });
+    }
+  })();
+
+  // 2. Generate a response using the LLM
+  const inputs = tokenizer.apply_chat_template(messages, {
+    add_generation_prompt: true,
+    return_dict: true,
+  });
+  const streamer = new TextStreamer(tokenizer, {
+    skip_prompt: true,
+    skip_special_tokens: true,
+    callback_function: (text) => {
+      splitter.push(text);
+    },
+    token_callback_function: () => {},
+  });
+
+  stopping_criteria = new InterruptableStoppingCriteria();
+  const { past_key_values, sequences } = await llm.generate({
+    ...inputs,
+    past_key_values: past_key_values_cache,
+
+    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
+    max_new_tokens: 1024,
+    streamer,
+    stopping_criteria,
+    return_dict_in_generate: true,
+  });
+  past_key_values_cache = past_key_values;
+
+  // Finally, close the stream to signal that no more text will be added.
+  splitter.close();
+
+  const decoded = tokenizer.batch_decode(
+    sequences.slice(null, [inputs.input_ids.dims[1], null]),
+    { skip_special_tokens: true },
+  );
+
+  messages.push({ role: "assistant", content: decoded[0] });
+};
+
+// Track the number of samples after the last speech chunk
+let postSpeechSamples = 0;
+const resetAfterRecording = (offset = 0) => {
+  self.postMessage({
+    type: "status",
+    status: "recording_end",
+    message: "Transcribing...",
+    duration: "until_next",
+  });
+  BUFFER.fill(0, offset);
+  bufferPointer = offset;
+  isRecording = false;
+  postSpeechSamples = 0;
+};
+
+const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
+  // Get start and end time of the speech segment, minus the padding
+  const now = Date.now();
+  const end =
+    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+  const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
+  const duration = end - start;
+  const overflowLength = overflow?.length ?? 0;
+
+  // Send the audio buffer to the worker
+  const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
+
+  const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
+  const paddedBuffer = new Float32Array(prevLength + buffer.length);
+  let offset = 0;
+  for (const prev of prevBuffers) {
+    paddedBuffer.set(prev, offset);
+    offset += prev.length;
+  }
+  paddedBuffer.set(buffer, offset);
+  speechToSpeech(paddedBuffer, { start, end, duration });
+
+  // Set overflow (if present) and reset the rest of the audio buffer
+  if (overflow) {
+    BUFFER.set(overflow, 0);
+  }
+  resetAfterRecording(overflowLength);
+};
+
+let prevBuffers = [];
+self.onmessage = async (event) => {
+  const { type, buffer } = event.data;
+
+  // refuse new audio while playing back
+  if (type === "audio" && isPlaying) return;
+
+  switch (type) {
+    case "start_call": {
+      const name = tts.voices[voice ?? "af_heart"]?.name ?? "Heart";
+      greet(`Hey there, my name is ${name}! How can I help you today?`);
+      return;
+    }
+    case "end_call":
+      messages = [SYSTEM_MESSAGE];
+      past_key_values_cache = null;
+    // falls through to "interrupt", which stops any in-progress generation
+    case "interrupt":
+      stopping_criteria?.interrupt();
+      return;
+    case "set_voice":
+      voice = event.data.voice;
+      return;
+    case "playback_ended":
+      isPlaying = false;
+      return;
+  }
+
+  const wasRecording = isRecording; // Save current state
+  const isSpeech = await vad(buffer);
+
+  if (!wasRecording && !isSpeech) {
+    // We are not recording, and the buffer is not speech,
+    // so we will probably discard the buffer. So, we insert
+    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
+    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+      // If the queue is full, we discard the oldest buffer
+      prevBuffers.shift();
+    }
+    prevBuffers.push(buffer);
+    return;
+  }
+
+  const remaining = BUFFER.length - bufferPointer;
+  if (buffer.length >= remaining) {
+    // The buffer is larger than (or equal to) the remaining space in the global buffer,
+    // so we perform transcription and copy the overflow to the global buffer
+    BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
+    bufferPointer += remaining;
+
+    // Dispatch the audio buffer
+    const overflow = buffer.subarray(remaining);
+    dispatchForTranscriptionAndResetAudioBuffer(overflow);
+    return;
+  } else {
+    // The buffer is smaller than the remaining space in the global buffer,
+    // so we copy it to the global buffer
+    BUFFER.set(buffer, bufferPointer);
+    bufferPointer += buffer.length;
+  }
+
+  if (isSpeech) {
+    if (!isRecording) {
+      // Indicate start of recording
+      self.postMessage({
+        type: "status",
+        status: "recording_start",
+        message: "Listening...",
+        duration: "until_next",
+      });
+    }
+    // Start or continue recording
+    isRecording = true;
+    postSpeechSamples = 0; // Reset the post-speech samples
+    return;
+  }
+
+  postSpeechSamples += buffer.length;
+
+  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
+  // So, we check whether we have reached the end of the current audio chunk.
+  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+    // There was a short pause, but not long enough to consider the end of a speech chunk
+    // (e.g., the speaker took a breath), so we continue recording
+    return;
+  }
+
+  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+    // The entire buffer (including the new chunk) is smaller than the minimum
+    // duration of a speech chunk, so we can safely discard the buffer.
+    resetAfterRecording();
+    return;
+  }
+
+  dispatchForTranscriptionAndResetAudioBuffer();
+};
+
+function greet(text) {
+  isPlaying = true;
+  const splitter = new TextSplitterStream();
+  const stream = tts.stream(splitter, { voice });
+  (async () => {
+    for await (const { text: chunkText, audio } of stream) {
+      self.postMessage({ type: "output", text: chunkText, result: audio });
+    }
+  })();
+  splitter.push(text);
+  splitter.close();
+  messages.push({ role: "assistant", content: text });
+}
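Taken together, the worker exposes a small message protocol: it accepts `audio` (with a 512-sample `buffer`), `start_call`, `end_call`, `interrupt`, `set_voice`, and `playback_ended`, and emits `info`, `status` (`recording_start`, `recording_end`, and `ready` with the voice list), `output` (text plus a TTS audio chunk), and `error`. A minimal hypothetical client, separate from the real UI in `src/App.jsx`:

```js
// Hypothetical minimal client for src/worker.js; the message names are taken
// from the switch statements above, everything else is illustrative.
const worker = new Worker(new URL("./worker.js", import.meta.url), {
  type: "module",
});

worker.addEventListener("message", ({ data }) => {
  if (data.error) return console.error(data.error);
  if (data.type === "status" && data.status === "ready") {
    worker.postMessage({ type: "set_voice", voice: "af_heart" });
    worker.postMessage({ type: "start_call" });
  } else if (data.type === "output") {
    // data.result is a kokoro-js audio chunk; after playing it, acknowledge:
    // worker.postMessage({ type: "playback_ended" });
  }
});

// Microphone chunks (e.g. from the VAD worklet) are streamed in like so:
// worker.postMessage({ type: "audio", buffer: new Float32Array(512) });
```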
vite.config.js ADDED
@@ -0,0 +1,19 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+import tailwindcss from "@tailwindcss/vite";
+
+// https://vite.dev/config/
+export default defineConfig({
+  plugins: [tailwindcss(), react()],
+  build: {
+    target: "esnext",
+  },
+  worker: {
+    format: "es",
+  },
+  resolve: {
+    // Only bundle a single instance of Transformers.js
+    // (shared by `@huggingface/transformers` and `kokoro-js`)
+    dedupe: ["@huggingface/transformers"],
+  },
+});
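Two settings here are load-bearing for this app: `worker.format: "es"` together with `build.target: "esnext"` keeps `src/worker.js` a module worker, so its ESM imports and top-level `await` survive the production build, and `resolve.dedupe` ensures `kokoro-js` and the app share one copy of `@huggingface/transformers` rather than each bundling (and initializing) its own.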