Xenova (HF Staff) committed
Commit 022e8a2 · verified · 1 Parent(s): 74cbdd4
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+public/logo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,81 +1,10 @@
 ---
-title: Speech To Speech Webgpu
-emoji: 🐠
+title: Speech To Speech WebGPU
+emoji: 🗣️
 colorFrom: indigo
 colorTo: red
 sdk: static
 pinned: false
 app_build_command: npm run build
-app_file: build/index.html
+app_file: dist/index.html
 ---
-
-# Getting Started with Create React App
-
-This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app).
-
-## Available Scripts
-
-In the project directory, you can run:
-
-### `npm start`
-
-Runs the app in the development mode.\
-Open [http://localhost:3000](http://localhost:3000) to view it in your browser.
-
-The page will reload when you make changes.\
-You may also see any lint errors in the console.
-
-### `npm test`
-
-Launches the test runner in the interactive watch mode.\
-See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information.
-
-### `npm run build`
-
-Builds the app for production to the `build` folder.\
-It correctly bundles React in production mode and optimizes the build for the best performance.
-
-The build is minified and the filenames include the hashes.\
-Your app is ready to be deployed!
-
-See the section about [deployment](https://facebook.github.io/create-react-app/docs/deployment) for more information.
-
-### `npm run eject`
-
-**Note: this is a one-way operation. Once you `eject`, you can't go back!**
-
-If you aren't satisfied with the build tool and configuration choices, you can `eject` at any time. This command will remove the single build dependency from your project.
-
-Instead, it will copy all the configuration files and the transitive dependencies (webpack, Babel, ESLint, etc) right into your project so you have full control over them. All of the commands except `eject` will still work, but they will point to the copied scripts so you can tweak them. At this point you're on your own.
-
-You don't have to ever use `eject`. The curated feature set is suitable for small and middle deployments, and you shouldn't feel obligated to use this feature. However we understand that this tool wouldn't be useful if you couldn't customize it when you are ready for it.
-
-## Learn More
-
-You can learn more in the [Create React App documentation](https://facebook.github.io/create-react-app/docs/getting-started).
-
-To learn React, check out the [React documentation](https://reactjs.org/).
-
-### Code Splitting
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/code-splitting](https://facebook.github.io/create-react-app/docs/code-splitting)
-
-### Analyzing the Bundle Size
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size](https://facebook.github.io/create-react-app/docs/analyzing-the-bundle-size)
-
-### Making a Progressive Web App
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app](https://facebook.github.io/create-react-app/docs/making-a-progressive-web-app)
-
-### Advanced Configuration
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/advanced-configuration](https://facebook.github.io/create-react-app/docs/advanced-configuration)
-
-### Deployment
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/deployment](https://facebook.github.io/create-react-app/docs/deployment)
-
-### `npm run build` fails to minify
-
-This section has moved here: [https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify](https://facebook.github.io/create-react-app/docs/troubleshooting#npm-run-build-fails-to-minify)
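The front matter above is the Hugging Face Spaces configuration for a static Space: the Space runs `app_build_command` (`npm run build`) and serves the resulting `app_file`. The switch from `build/index.html` to `dist/index.html` follows from the Create React App → Vite migration in this commit, since Vite emits its production build to `dist/` by default.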
eslint.config.js ADDED
@@ -0,0 +1,33 @@
+import js from "@eslint/js";
+import globals from "globals";
+import reactHooks from "eslint-plugin-react-hooks";
+import reactRefresh from "eslint-plugin-react-refresh";
+
+export default [
+  { ignores: ["dist"] },
+  {
+    files: ["**/*.{js,jsx}"],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+      parserOptions: {
+        ecmaVersion: "latest",
+        ecmaFeatures: { jsx: true },
+        sourceType: "module",
+      },
+    },
+    plugins: {
+      "react-hooks": reactHooks,
+      "react-refresh": reactRefresh,
+    },
+    rules: {
+      ...js.configs.recommended.rules,
+      ...reactHooks.configs.recommended.rules,
+      "no-unused-vars": ["error", { varsIgnorePattern: "^[A-Z_]" }],
+      "react-refresh/only-export-components": [
+        "warn",
+        { allowConstantExport: true },
+      ],
+    },
+  },
+];
index.html ADDED
@@ -0,0 +1,13 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/png" href="/logo.png" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Transformers.js | Speech-to-speech demo</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.jsx"></script>
+  </body>
+</html>
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json CHANGED
@@ -1,39 +1,32 @@
 {
-  "name": "react-template",
-  "version": "0.1.0",
+  "name": "speech-to-speech",
   "private": true,
-  "dependencies": {
-    "@testing-library/dom": "^10.4.0",
-    "@testing-library/jest-dom": "^6.6.3",
-    "@testing-library/react": "^16.3.0",
-    "@testing-library/user-event": "^13.5.0",
-    "react": "^19.1.0",
-    "react-dom": "^19.1.0",
-    "react-scripts": "5.0.1",
-    "web-vitals": "^2.1.4"
-  },
+  "version": "0.0.0",
+  "type": "module",
   "scripts": {
-    "start": "react-scripts start",
-    "build": "react-scripts build",
-    "test": "react-scripts test",
-    "eject": "react-scripts eject"
+    "dev": "vite",
+    "build": "vite build",
+    "lint": "eslint .",
+    "preview": "vite preview"
   },
-  "eslintConfig": {
-    "extends": [
-      "react-app",
-      "react-app/jest"
-    ]
+  "dependencies": {
+    "@huggingface/transformers": "^3.5.2",
+    "@tailwindcss/vite": "^4.1.4",
+    "kokoro-js": "^1.2.1",
+    "lucide-react": "^0.503.0",
+    "react": "^19.0.0",
+    "react-dom": "^19.0.0",
+    "tailwindcss": "^4.1.4"
   },
-  "browserslist": {
-    "production": [
-      ">0.2%",
-      "not dead",
-      "not op_mini all"
-    ],
-    "development": [
-      "last 1 chrome version",
-      "last 1 firefox version",
-      "last 1 safari version"
-    ]
-  }
+  "devDependencies": {
+    "@eslint/js": "^9.22.0",
+    "@types/react": "^19.0.10",
+    "@types/react-dom": "^19.0.4",
+    "@vitejs/plugin-react": "^4.3.4",
+    "eslint": "^9.22.0",
+    "eslint-plugin-react-hooks": "^5.2.0",
+    "eslint-plugin-react-refresh": "^0.4.19",
+    "globals": "^16.0.0",
+    "vite": "^6.3.1"
+  }
 }
public/favicon.ico DELETED
Binary file (3.87 kB)
 
public/index.html DELETED
@@ -1,43 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-  <head>
-    <meta charset="utf-8" />
-    <link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <meta name="theme-color" content="#000000" />
-    <meta
-      name="description"
-      content="Web site created using create-react-app"
-    />
-    <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
-    <!--
-      manifest.json provides metadata used when your web app is installed on a
-      user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
-    -->
-    <link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
-    <!--
-      Notice the use of %PUBLIC_URL% in the tags above.
-      It will be replaced with the URL of the `public` folder during the build.
-      Only files inside the `public` folder can be referenced from the HTML.
-
-      Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
-      work correctly both with client-side routing and a non-root public URL.
-      Learn how to configure a non-root public URL by running `npm run build`.
-    -->
-    <title>React App</title>
-  </head>
-  <body>
-    <noscript>You need to enable JavaScript to run this app.</noscript>
-    <div id="root"></div>
-    <!--
-      This HTML file is a template.
-      If you open it directly in the browser, you will see an empty page.
-
-      You can add webfonts, meta tags, or analytics to this file.
-      The build step will place the bundled scripts into the <body> tag.
-
-      To begin the development, run `npm start` or `yarn start`.
-      To create a production bundle, use `npm run build` or `yarn build`.
-    -->
-  </body>
-</html>
public/logo.png ADDED
Git LFS Details
  • SHA256: 9fb2bd90d1eeab88414681bb80464bc723ab57fb2c5b2f33367f16a0157ed5c0
  • Pointer size: 131 Bytes
  • Size of remote file: 634 kB
public/logo192.png DELETED
Binary file (5.35 kB)
 
public/logo512.png DELETED
Binary file (9.66 kB)
 
public/manifest.json DELETED
@@ -1,25 +0,0 @@
-{
-  "short_name": "React App",
-  "name": "Create React App Sample",
-  "icons": [
-    {
-      "src": "favicon.ico",
-      "sizes": "64x64 32x32 24x24 16x16",
-      "type": "image/x-icon"
-    },
-    {
-      "src": "logo192.png",
-      "type": "image/png",
-      "sizes": "192x192"
-    },
-    {
-      "src": "logo512.png",
-      "type": "image/png",
-      "sizes": "512x512"
-    }
-  ],
-  "start_url": ".",
-  "display": "standalone",
-  "theme_color": "#000000",
-  "background_color": "#ffffff"
-}
public/robots.txt DELETED
@@ -1,3 +0,0 @@
-# https://www.robotstxt.org/robotstxt.html
-User-agent: *
-Disallow:
src/App.css DELETED
@@ -1,38 +0,0 @@
-.App {
-  text-align: center;
-}
-
-.App-logo {
-  height: 40vmin;
-  pointer-events: none;
-}
-
-@media (prefers-reduced-motion: no-preference) {
-  .App-logo {
-    animation: App-logo-spin infinite 20s linear;
-  }
-}
-
-.App-header {
-  background-color: #282c34;
-  min-height: 100vh;
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  font-size: calc(10px + 2vmin);
-  color: white;
-}
-
-.App-link {
-  color: #61dafb;
-}
-
-@keyframes App-logo-spin {
-  from {
-    transform: rotate(0deg);
-  }
-  to {
-    transform: rotate(360deg);
-  }
-}
src/App.js DELETED
@@ -1,25 +0,0 @@
-import logo from './logo.svg';
-import './App.css';
-
-function App() {
-  return (
-    <div className="App">
-      <header className="App-header">
-        <img src={logo} className="App-logo" alt="logo" />
-        <p>
-          Edit <code>src/App.js</code> and save to reload.
-        </p>
-        <a
-          className="App-link"
-          href="https://reactjs.org"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          Learn React
-        </a>
-      </header>
-    </div>
-  );
-}
-
-export default App;
src/App.jsx ADDED
@@ -0,0 +1,367 @@
+import { useEffect, useState, useRef } from "react";
+import { Mic, PhoneOff, ChevronDown } from "lucide-react";
+import { INPUT_SAMPLE_RATE } from "./constants";
+
+import WORKLET from "./play-worklet.js";
+
+export default function App() {
+  const [callStartTime, setCallStartTime] = useState(null);
+  const [callStarted, setCallStarted] = useState(false);
+  const [playing, setPlaying] = useState(false);
+
+  const [voice, setVoice] = useState("af_heart");
+  const [voices, setVoices] = useState([]);
+
+  const [isListening, setIsListening] = useState(false);
+  const [isSpeaking, setIsSpeaking] = useState(false);
+  const [listeningScale, setListeningScale] = useState(1);
+  const [speakingScale, setSpeakingScale] = useState(1);
+  const [ripples, setRipples] = useState([]);
+
+  const [ready, setReady] = useState(false);
+  const [error, setError] = useState(null);
+  const [elapsedTime, setElapsedTime] = useState("00:00");
+  const worker = useRef(null);
+
+  const node = useRef(null);
+
+  useEffect(() => {
+    worker.current?.postMessage({
+      type: "set_voice",
+      voice,
+    });
+  }, [voice]);
+
+  useEffect(() => {
+    if (!callStarted) {
+      // Reset worker state after call ends
+      worker.current?.postMessage({
+        type: "end_call",
+      });
+    }
+  }, [callStarted]);
+
+  useEffect(() => {
+    if (callStarted && callStartTime) {
+      const interval = setInterval(() => {
+        const diff = Math.floor((Date.now() - callStartTime) / 1000);
+        const minutes = String(Math.floor(diff / 60)).padStart(2, "0");
+        const seconds = String(diff % 60).padStart(2, "0");
+        setElapsedTime(`${minutes}:${seconds}`);
+      }, 1000);
+      return () => clearInterval(interval);
+    } else {
+      setElapsedTime("00:00");
+    }
+  }, [callStarted, callStartTime]);
+
+  useEffect(() => {
+    worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
+      type: "module",
+    });
+
+    const onMessage = ({ data }) => {
+      console.log("Worker message:", data);
+      if (data.error) {
+        return onError(data.error);
+      }
+
+      switch (data.type) {
+        case "status":
+          if (data.status === "recording_start") {
+            setIsListening(true);
+            setIsSpeaking(false);
+          } else if (data.status === "recording_end") {
+            setIsListening(false);
+          } else if (data.status === "ready") {
+            setVoices(data.voices);
+            setReady(true);
+          }
+          break;
+        case "output":
+          if (!playing) {
+            node.current?.port.postMessage(data.result.audio);
+            setPlaying(true);
+            setIsSpeaking(true);
+            setIsListening(false);
+          }
+          break;
+      }
+    };
+    const onError = (err) => setError(err.message);
+
+    worker.current.addEventListener("message", onMessage);
+    worker.current.addEventListener("error", onError);
+
+    return () => {
+      worker.current.removeEventListener("message", onMessage);
+      worker.current.removeEventListener("error", onError);
+    };
+  }, []);
+
+  useEffect(() => {
+    if (!callStarted) return;
+
+    let worklet;
+    let inputAudioContext;
+    let source;
+    let ignore = false;
+
+    let outputAudioContext;
+    const audioStreamPromise = navigator.mediaDevices.getUserMedia({
+      audio: {
+        channelCount: 1,
+        echoCancellation: true,
+        autoGainControl: true,
+        noiseSuppression: true,
+        sampleRate: INPUT_SAMPLE_RATE,
+      },
+    });
+
+    audioStreamPromise
+      .then(async (stream) => {
+        if (ignore) return;
+
+        inputAudioContext = new (window.AudioContext ||
+          window.webkitAudioContext)({
+          sampleRate: INPUT_SAMPLE_RATE,
+        });
+
+        const analyser = inputAudioContext.createAnalyser();
+        analyser.fftSize = 256;
+        source = inputAudioContext.createMediaStreamSource(stream);
+        source.connect(analyser);
+
+        const inputDataArray = new Uint8Array(analyser.frequencyBinCount);
+
+        function calculateRMS(array) {
+          let sum = 0;
+          for (let i = 0; i < array.length; ++i) {
+            const normalized = array[i] / 128 - 1;
+            sum += normalized * normalized;
+          }
+          const rms = Math.sqrt(sum / array.length);
+          return rms;
+        }
+
+        await inputAudioContext.audioWorklet.addModule(
+          new URL("./vad-processor.js", import.meta.url),
+        );
+        worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
+          numberOfInputs: 1,
+          numberOfOutputs: 0,
+          channelCount: 1,
+          channelCountMode: "explicit",
+          channelInterpretation: "discrete",
+        });
+
+        source.connect(worklet);
+        worklet.port.onmessage = (event) => {
+          const { buffer } = event.data;
+          worker.current?.postMessage({ type: "audio", buffer });
+        };
+
+        outputAudioContext = new AudioContext({
+          sampleRate: 24000,
+        });
+        outputAudioContext.resume();
+
+        const blob = new Blob([`(${WORKLET.toString()})()`], {
+          type: "application/javascript",
+        });
+        const url = URL.createObjectURL(blob);
+        await outputAudioContext.audioWorklet.addModule(url);
+        URL.revokeObjectURL(url);
+
+        node.current = new AudioWorkletNode(
+          outputAudioContext,
+          "buffered-audio-worklet-processor",
+        );
+
+        node.current.port.onmessage = (event) => {
+          if (event.data.type === "playback_ended") {
+            setPlaying(false);
+            setIsSpeaking(false);
+            worker.current?.postMessage({ type: "playback_ended" });
+          }
+        };
+
+        const outputAnalyser = outputAudioContext.createAnalyser();
+        outputAnalyser.fftSize = 256;
+
+        node.current.connect(outputAnalyser);
+        outputAnalyser.connect(outputAudioContext.destination);
+
+        const outputDataArray = new Uint8Array(
+          outputAnalyser.frequencyBinCount,
+        );
+
+        function updateVisualizers() {
+          analyser.getByteTimeDomainData(inputDataArray);
+          const rms = calculateRMS(inputDataArray);
+          const targetScale = 1 + Math.min(1.25 * rms, 0.25);
+          setListeningScale((prev) => prev + (targetScale - prev) * 0.25);
+
+          outputAnalyser.getByteTimeDomainData(outputDataArray);
+          const outputRMS = calculateRMS(outputDataArray);
+          const targetOutputScale = 1 + Math.min(1.25 * outputRMS, 0.25);
+          setSpeakingScale((prev) => prev + (targetOutputScale - prev) * 0.25);
+
+          requestAnimationFrame(updateVisualizers);
+        }
+        updateVisualizers();
+      })
+      .catch((err) => {
+        setError(err.message);
+        console.error(err);
+      });
+
+    return () => {
+      ignore = true;
+
+      audioStreamPromise.then((stream) =>
+        stream.getTracks().forEach((track) => track.stop()),
+      );
+      source?.disconnect();
+      worklet?.disconnect();
+      inputAudioContext?.close();
+
+      outputAudioContext?.close();
+    };
+  }, [callStarted]);
+
+  useEffect(() => {
+    if (!callStarted) return;
+    const interval = setInterval(() => {
+      const id = Date.now();
+      setRipples((prev) => [...prev, id]);
+      setTimeout(() => {
+        setRipples((prev) => prev.filter((r) => r !== id));
+      }, 1500);
+    }, 1000);
+    return () => clearInterval(interval);
+  }, [callStarted]);
+
+  return (
+    <div className="h-screen min-h-[240px] flex items-center justify-center bg-gray-50 p-4 relative">
+      <div className="h-full max-h-[320px] w-[640px] bg-white rounded-xl shadow-lg p-8 flex items-center justify-between space-x-16">
+        <div className="text-green-700 w-[140px]">
+          <div className="text-xl font-bold flex justify-between">
+            {voices?.[voice]?.name}
+            <span className="font-normal text-gray-500">{elapsedTime}</span>
+          </div>
+          <div className="text-base relative">
+            <button
+              type="button"
+              disabled={!ready}
+              className={`w-full flex items-center justify-between border border-gray-300 rounded-md transition-colors ${
+                ready
+                  ? "bg-transparent hover:border-gray-400"
+                  : "bg-gray-100 opacity-50 cursor-not-allowed"
+              }`}
+            >
+              <span className="px-2 py-1">Select voice</span>
+              <ChevronDown className="absolute right-2" />
+            </button>
+            <select
+              value={voice}
+              onChange={(e) => setVoice(e.target.value)}
+              className="absolute inset-0 opacity-0 cursor-pointer"
+              disabled={!ready}
+            >
+              {Object.entries(voices).map(([key, v]) => (
+                <option key={key} value={key}>
+                  {`${v.name} (${
+                    v.language === "en-us" ? "American" : v.language
+                  } ${v.gender})`}
+                </option>
+              ))}
+            </select>
+          </div>
+        </div>
+
+        <div className="relative flex items-center justify-center w-32 h-32 flex-shrink-0 aspect-square">
+          {callStarted &&
+            ripples.map((id) => (
+              <div
+                key={id}
+                className="absolute inset-0 rounded-full border-2 border-green-200 pointer-events-none"
+                style={{ animation: "ripple 1.5s ease-out forwards" }}
+              />
+            ))}
+          <div className="absolute z-10 text-lg text-gray-700">
+            {!ready ? "Loading..." : ""}
+            {isListening && "Listening..."}
+            {isSpeaking && "Speaking..."}
+          </div>
+          {/* Pulsing loader while initializing */}
+          <div
+            className={`absolute w-32 h-32 rounded-full bg-green-200 ${
+              !ready ? "animate-ping opacity-75" : ""
+            }`}
+            style={{ animationDuration: "1.5s" }}
+          />
+          {/* Main rings */}
+          <div
+            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-300 ${
+              !ready ? "opacity-0" : ""
+            }`}
+            style={{ transform: `scale(${speakingScale})` }}
+          />
+          <div
+            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-200 ${
+              !ready ? "opacity-0" : ""
+            }`}
+            style={{ transform: `scale(${listeningScale})` }}
+          />
+        </div>
+
+        <div className="space-y-4 w-[140px]">
+          {callStarted ? (
+            <button
+              className="flex items-center space-x-2 px-4 py-2 bg-red-100 text-red-700 rounded-md hover:bg-red-200"
+              onClick={() => {
+                setCallStarted(false);
+                setCallStartTime(null);
+                setPlaying(false);
+                setIsListening(false);
+                setIsSpeaking(false);
+              }}
+            >
+              <PhoneOff className="w-5 h-5" />
+              <span>End call</span>
+            </button>
+          ) : (
+            <button
+              className={`flex items-center space-x-2 px-4 py-2 rounded-md ${
+                ready
+                  ? "bg-blue-100 text-blue-700 hover:bg-blue-200"
+                  : "bg-blue-100 text-blue-700 opacity-50 cursor-not-allowed"
+              }`}
+              onClick={() => {
+                setCallStartTime(Date.now());
+                setCallStarted(true);
+                worker.current?.postMessage({ type: "start_call" });
+              }}
+              disabled={!ready}
+            >
+              <span>Start call</span>
+            </button>
+          )}
+        </div>
+      </div>
+
+      <div className="absolute bottom-4 text-sm">
+        Built with{" "}
+        <a
+          href="https://github.com/huggingface/transformers.js"
+          rel="noopener noreferrer"
+          target="_blank"
+          className="text-blue-600 hover:underline"
+        >
+          🤗 Transformers.js
+        </a>
+      </div>
+    </div>
+  );
+}
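A note on the audio wiring above: the app runs two separate AudioContexts, a 16 kHz input context (matching `INPUT_SAMPLE_RATE`) whose `vad-processor` worklet forwards 512-sample chunks to the worker, and a 24 kHz output context (the TTS output rate) running the buffered playback worklet. The playback worklet in `src/play-worklet.js` is a plain default-exported function, so it is loaded by stringifying it into a Blob URL rather than shipping a separate worklet asset.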
src/App.test.js DELETED
@@ -1,8 +0,0 @@
-import { render, screen } from '@testing-library/react';
-import App from './App';
-
-test('renders learn react link', () => {
-  render(<App />);
-  const linkElement = screen.getByText(/learn react/i);
-  expect(linkElement).toBeInTheDocument();
-});
src/constants.js ADDED
@@ -0,0 +1,53 @@
+/**
+ * Sample rate of the input audio.
+ * Coincidentally, this is the same for both models (Moonshine and Silero VAD)
+ */
+export const INPUT_SAMPLE_RATE = 16000;
+const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;
+
+/**
+ * Probabilities ABOVE this value are considered as SPEECH
+ */
+export const SPEECH_THRESHOLD = 0.3;
+
+/**
+ * If the current state is SPEECH, and the probability of the next state
+ * is below this value, it is considered as NON-SPEECH.
+ */
+export const EXIT_THRESHOLD = 0.1;
+
+/**
+ * After each speech chunk, wait for at least this amount of silence
+ * before considering the next chunk as a new speech chunk
+ */
+export const MIN_SILENCE_DURATION_MS = 400;
+export const MIN_SILENCE_DURATION_SAMPLES =
+  MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;
+
+/**
+ * Pad the speech chunk with this amount on each side
+ */
+export const SPEECH_PAD_MS = 80;
+export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;
+
+/**
+ * Final speech chunks below this duration are discarded
+ */
+export const MIN_SPEECH_DURATION_SAMPLES = 250 * INPUT_SAMPLE_RATE_MS; // 250 ms
+
+/**
+ * Maximum duration of audio that can be handled by Moonshine
+ */
+export const MAX_BUFFER_DURATION = 30;
+
+/**
+ * Size of the incoming buffers
+ */
+export const NEW_BUFFER_SIZE = 512;
+
+/**
+ * The number of previous buffers to keep, to ensure the audio is padded correctly
+ */
+export const MAX_NUM_PREV_BUFFERS = Math.ceil(
+  SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
+);
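For concreteness, with `INPUT_SAMPLE_RATE_MS = 16` samples per millisecond these constants work out to: `MIN_SILENCE_DURATION_SAMPLES = 400 × 16 = 6400`, `SPEECH_PAD_SAMPLES = 80 × 16 = 1280`, `MIN_SPEECH_DURATION_SAMPLES = 250 × 16 = 4000`, and `MAX_NUM_PREV_BUFFERS = ⌈1280 / 512⌉ = 3`, i.e. three previous 512-sample buffers are retained so a detected speech chunk can be left-padded with 80 ms of leading audio.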
src/index.css CHANGED
@@ -1,13 +1,12 @@
-body {
-  margin: 0;
-  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
-    'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
-    sans-serif;
-  -webkit-font-smoothing: antialiased;
-  -moz-osx-font-smoothing: grayscale;
-}
+@import "tailwindcss";
 
-code {
-  font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
-    monospace;
+@keyframes ripple {
+  from {
+    transform: scale(1);
+    opacity: 0.7;
+  }
+  to {
+    transform: scale(2);
+    opacity: 0;
+  }
 }
src/index.js DELETED
@@ -1,17 +0,0 @@
-import React from 'react';
-import ReactDOM from 'react-dom/client';
-import './index.css';
-import App from './App';
-import reportWebVitals from './reportWebVitals';
-
-const root = ReactDOM.createRoot(document.getElementById('root'));
-root.render(
-  <React.StrictMode>
-    <App />
-  </React.StrictMode>
-);
-
-// If you want to start measuring performance in your app, pass a function
-// to log results (for example: reportWebVitals(console.log))
-// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
-reportWebVitals();
src/logo.svg DELETED
src/main.jsx ADDED
@@ -0,0 +1,10 @@
+import { StrictMode } from "react";
+import { createRoot } from "react-dom/client";
+import "./index.css";
+import App from "./App.jsx";
+
+createRoot(document.getElementById("root")).render(
+  <StrictMode>
+    <App />
+  </StrictMode>,
+);
src/play-worklet.js ADDED
@@ -0,0 +1,73 @@
+export default () => {
+  class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
+    constructor() {
+      super();
+      this.bufferQueue = [];
+      this.currentChunkOffset = 0;
+      this.hadData = false;
+
+      this.port.onmessage = (event) => {
+        const data = event.data;
+        if (data instanceof Float32Array) {
+          this.hadData = true;
+          this.bufferQueue.push(data);
+        } else if (data === "stop") {
+          this.bufferQueue = [];
+          this.currentChunkOffset = 0;
+        }
+      };
+    }
+
+    process(inputs, outputs) {
+      const channel = outputs[0][0];
+      if (!channel) return true;
+
+      const numSamples = channel.length;
+      let outputIndex = 0;
+
+      if (this.hadData && this.bufferQueue.length === 0) {
+        this.port.postMessage({ type: "playback_ended" });
+        this.hadData = false;
+      }
+
+      while (outputIndex < numSamples) {
+        if (this.bufferQueue.length > 0) {
+          const currentChunk = this.bufferQueue[0];
+          const remainingSamples = currentChunk.length - this.currentChunkOffset;
+          const samplesToCopy = Math.min(
+            remainingSamples,
+            numSamples - outputIndex,
+          );
+
+          channel.set(
+            currentChunk.subarray(
+              this.currentChunkOffset,
+              this.currentChunkOffset + samplesToCopy,
+            ),
+            outputIndex,
+          );
+
+          this.currentChunkOffset += samplesToCopy;
+          outputIndex += samplesToCopy;
+
+          // Remove the chunk if fully consumed.
+          if (this.currentChunkOffset >= currentChunk.length) {
+            this.bufferQueue.shift();
+            this.currentChunkOffset = 0;
+          }
+        } else {
+          // If no data is available, fill the rest of the buffer with silence.
+          channel.fill(0, outputIndex);
+          outputIndex = numSamples;
+        }
+      }
+      return true;
+    }
+  }
+
+  registerProcessor(
+    "buffered-audio-worklet-processor",
+    BufferedAudioWorkletProcessor,
+  );
+};
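Because the processor is wrapped in a default-exported function rather than a standalone worklet module, the main thread loads it by stringifying the function into a Blob URL. A minimal usage sketch, mirroring the wiring in `src/App.jsx` (top-level `await` assumed):

```js
import WORKLET from "./play-worklet.js";

const ctx = new AudioContext({ sampleRate: 24000 });

// Serialize the exported function into an immediately-invoked module,
// which calls registerProcessor() inside the worklet scope.
const blob = new Blob([`(${WORKLET.toString()})()`], {
  type: "application/javascript",
});
const url = URL.createObjectURL(blob);
await ctx.audioWorklet.addModule(url);
URL.revokeObjectURL(url);

const node = new AudioWorkletNode(ctx, "buffered-audio-worklet-processor");
node.connect(ctx.destination);

node.port.postMessage(new Float32Array(24000)); // enqueue 1 s of silence
node.port.onmessage = (e) => {
  if (e.data.type === "playback_ended") console.log("queue drained");
};
```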
src/reportWebVitals.js DELETED
@@ -1,13 +0,0 @@
-const reportWebVitals = onPerfEntry => {
-  if (onPerfEntry && onPerfEntry instanceof Function) {
-    import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
-      getCLS(onPerfEntry);
-      getFID(onPerfEntry);
-      getFCP(onPerfEntry);
-      getLCP(onPerfEntry);
-      getTTFB(onPerfEntry);
-    });
-  }
-};
-
-export default reportWebVitals;
src/setupTests.js DELETED
@@ -1,5 +0,0 @@
-// jest-dom adds custom jest matchers for asserting on DOM nodes.
-// allows you to do things like:
-// expect(element).toHaveTextContent(/react/i)
-// learn more: https://github.com/testing-library/jest-dom
-import '@testing-library/jest-dom';
src/vad-processor.js ADDED
@@ -0,0 +1,37 @@
+const MIN_CHUNK_SIZE = 512;
+let globalPointer = 0;
+let globalBuffer = new Float32Array(MIN_CHUNK_SIZE);
+
+class VADProcessor extends AudioWorkletProcessor {
+  process(inputs, outputs, parameters) {
+    const buffer = inputs[0][0];
+    if (!buffer) return; // buffer is null when the stream ends
+
+    if (buffer.length > MIN_CHUNK_SIZE) {
+      // If the buffer is larger than the minimum chunk size, send the entire buffer
+      this.port.postMessage({ buffer });
+    } else {
+      const remaining = MIN_CHUNK_SIZE - globalPointer;
+      if (buffer.length >= remaining) {
+        // If the buffer is larger than (or equal to) the remaining space in the
+        // global buffer, copy the remaining space
+        globalBuffer.set(buffer.subarray(0, remaining), globalPointer);
+
+        // Send the global buffer
+        this.port.postMessage({ buffer: globalBuffer });
+
+        // Reset the global buffer and set the remaining buffer
+        globalBuffer.fill(0);
+        globalBuffer.set(buffer.subarray(remaining), 0);
+        globalPointer = buffer.length - remaining;
+      } else {
+        // If the buffer is smaller than the remaining space in the global
+        // buffer, copy the buffer to the global buffer
+        globalBuffer.set(buffer, globalPointer);
+        globalPointer += buffer.length;
+      }
+    }
+
+    return true; // Keep the processor alive
+  }
+}
+
+registerProcessor("vad-processor", VADProcessor);
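Worth noting: an AudioWorklet's `process()` receives 128-sample render quanta (per the Web Audio spec), so the `buffer.length > MIN_CHUNK_SIZE` branch is essentially defensive; in practice the processor accumulates four 128-sample blocks into `globalBuffer` and posts one 512-sample chunk at a time, matching the `NEW_BUFFER_SIZE` that the worker-side constants assume.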
src/worker.js ADDED
@@ -0,0 +1,355 @@
+import {
+  // VAD
+  AutoModel,
+
+  // LLM
+  AutoTokenizer,
+  AutoModelForCausalLM,
+  TextStreamer,
+  InterruptableStoppingCriteria,
+
+  // Speech recognition
+  Tensor,
+  pipeline,
+} from "@huggingface/transformers";
+
+import { KokoroTTS, TextSplitterStream } from "kokoro-js";
+
+import {
+  MAX_BUFFER_DURATION,
+  INPUT_SAMPLE_RATE,
+  SPEECH_THRESHOLD,
+  EXIT_THRESHOLD,
+  SPEECH_PAD_SAMPLES,
+  MAX_NUM_PREV_BUFFERS,
+  MIN_SILENCE_DURATION_SAMPLES,
+  MIN_SPEECH_DURATION_SAMPLES,
+} from "./constants";
+
+const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
+let voice;
+const tts = await KokoroTTS.from_pretrained(model_id, {
+  dtype: "fp32",
+  device: "webgpu",
+});
+
+const device = "webgpu";
+self.postMessage({ type: "info", message: `Using device: "${device}"` });
+self.postMessage({
+  type: "info",
+  message: "Loading models...",
+  duration: "until_next",
+});
+
+// Load models
+const silero_vad = await AutoModel.from_pretrained(
+  "onnx-community/silero-vad",
+  {
+    config: { model_type: "custom" },
+    dtype: "fp32", // Full-precision
+  },
+).catch((error) => {
+  self.postMessage({ error });
+  throw error;
+});
+
+const DEVICE_DTYPE_CONFIGS = {
+  webgpu: {
+    encoder_model: "fp32",
+    decoder_model_merged: "fp32",
+  },
+  wasm: {
+    encoder_model: "fp32",
+    decoder_model_merged: "q8",
+  },
+};
+const transcriber = await pipeline(
+  "automatic-speech-recognition",
+  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
+  {
+    device,
+    dtype: DEVICE_DTYPE_CONFIGS[device],
+  },
+).catch((error) => {
+  self.postMessage({ error });
+  throw error;
+});
+
+await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
+
+const llm_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct";
+const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
+const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
+  dtype: "q4f16",
+  device: "webgpu",
+});
+
+const SYSTEM_MESSAGE = {
+  role: "system",
+  content:
+    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
+};
+await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
+
+let messages = [SYSTEM_MESSAGE];
+let past_key_values_cache;
+let stopping_criteria;
+self.postMessage({
+  type: "status",
+  status: "ready",
+  message: "Ready!",
+  voices: tts.voices,
+});
+
+// Global audio buffer to store incoming audio
+const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
+let bufferPointer = 0;
+
+// Initial state for VAD
+const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
+let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
+
+// Whether we are in the process of adding audio to the buffer
+let isRecording = false;
+let isPlaying = false; // new flag
+
+/**
+ * Perform Voice Activity Detection (VAD)
+ * @param {Float32Array} buffer The new audio buffer
+ * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
+ */
+async function vad(buffer) {
+  const input = new Tensor("float32", buffer, [1, buffer.length]);
+
+  const { stateN, output } = await silero_vad({ input, sr, state });
+  state = stateN; // Update state
+
+  const isSpeech = output.data[0];
+
+  // Use heuristics to determine if the buffer is speech or not
+  return (
+    // Case 1: We are above the threshold (definitely speech)
+    isSpeech > SPEECH_THRESHOLD ||
+    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
+    (isRecording && isSpeech >= EXIT_THRESHOLD)
+  );
+}
+
+/**
+ * Transcribe the audio buffer
+ * @param {Float32Array} buffer The audio buffer
+ * @param {Object} data Additional data
+ */
+const speechToSpeech = async (buffer, data) => {
+  isPlaying = true;
+
+  // 1. Transcribe the audio from the user
+  const text = await transcriber(buffer).then(({ text }) => text.trim());
+  if (["", "[BLANK_AUDIO]"].includes(text)) {
+    // If the transcription is empty or a blank audio, we skip the rest of the processing
+    return;
+  }
+  messages.push({ role: "user", content: text });
+
+  // Set up text-to-speech streaming
+  const splitter = new TextSplitterStream();
+  const stream = tts.stream(splitter, {
+    voice,
+  });
+  (async () => {
+    for await (const { text, phonemes, audio } of stream) {
+      self.postMessage({ type: "output", text, result: audio });
+    }
+  })();
+
+  // 2. Generate a response using the LLM
+  const inputs = tokenizer.apply_chat_template(messages, {
+    add_generation_prompt: true,
+    return_dict: true,
+  });
+  const streamer = new TextStreamer(tokenizer, {
+    skip_prompt: true,
+    skip_special_tokens: true,
+    callback_function: (text) => {
+      splitter.push(text);
+    },
+    token_callback_function: () => {},
+  });
+
+  stopping_criteria = new InterruptableStoppingCriteria();
+  const { past_key_values, sequences } = await llm.generate({
+    ...inputs,
+    past_key_values: past_key_values_cache,
+
+    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
+    max_new_tokens: 1024,
+    streamer,
+    stopping_criteria,
+    return_dict_in_generate: true,
+  });
+  past_key_values_cache = past_key_values;
+
+  // Finally, close the stream to signal that no more text will be added.
+  splitter.close();
+
+  const decoded = tokenizer.batch_decode(
+    sequences.slice(null, [inputs.input_ids.dims[1], null]),
+    { skip_special_tokens: true },
+  );
+
+  messages.push({ role: "assistant", content: decoded[0] });
+};
+
+// Track the number of samples after the last speech chunk
+let postSpeechSamples = 0;
+const resetAfterRecording = (offset = 0) => {
+  self.postMessage({
+    type: "status",
+    status: "recording_end",
+    message: "Transcribing...",
+    duration: "until_next",
+  });
+  BUFFER.fill(0, offset);
+  bufferPointer = offset;
+  isRecording = false;
+  postSpeechSamples = 0;
+};
+
+const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
+  // Get start and end time of the speech segment, minus the padding
+  const now = Date.now();
+  const end =
+    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+  const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
+  const duration = end - start;
+  const overflowLength = overflow?.length ?? 0;
+
+  // Send the audio buffer to the worker
+  const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
+
+  const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
+  const paddedBuffer = new Float32Array(prevLength + buffer.length);
+  let offset = 0;
+  for (const prev of prevBuffers) {
+    paddedBuffer.set(prev, offset);
+    offset += prev.length;
+  }
+  paddedBuffer.set(buffer, offset);
+  speechToSpeech(paddedBuffer, { start, end, duration });
+
+  // Set overflow (if present) and reset the rest of the audio buffer
+  if (overflow) {
+    BUFFER.set(overflow, 0);
+  }
+  resetAfterRecording(overflowLength);
+};
+
+let prevBuffers = [];
+self.onmessage = async (event) => {
+  const { type, buffer } = event.data;
+
+  // refuse new audio while playing back
+  if (type === "audio" && isPlaying) return;
+
+  switch (type) {
+    case "start_call": {
+      const name = tts.voices[voice ?? "af_heart"]?.name ?? "Heart";
+      greet(`Hey there, my name is ${name}! How can I help you today?`);
+      return;
+    }
+    case "end_call":
+      messages = [SYSTEM_MESSAGE];
+      past_key_values_cache = null;
+    // falls through to "interrupt", which stops any in-progress generation
+    case "interrupt":
+      stopping_criteria?.interrupt();
+      return;
+    case "set_voice":
+      voice = event.data.voice;
+      return;
+    case "playback_ended":
+      isPlaying = false;
+      return;
+  }
+
+  const wasRecording = isRecording; // Save current state
+  const isSpeech = await vad(buffer);
+
+  if (!wasRecording && !isSpeech) {
+    // We are not recording, and the buffer is not speech,
+    // so we will probably discard the buffer. So, we insert
+    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
+    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+      // If the queue is full, we discard the oldest buffer
+      prevBuffers.shift();
+    }
+    prevBuffers.push(buffer);
+    return;
+  }
+
+  const remaining = BUFFER.length - bufferPointer;
+  if (buffer.length >= remaining) {
+    // The buffer is larger than (or equal to) the remaining space in the global buffer,
+    // so we perform transcription and copy the overflow to the global buffer
+    BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
+    bufferPointer += remaining;
+
+    // Dispatch the audio buffer
+    const overflow = buffer.subarray(remaining);
+    dispatchForTranscriptionAndResetAudioBuffer(overflow);
+    return;
+  } else {
+    // The buffer is smaller than the remaining space in the global buffer,
+    // so we copy it to the global buffer
+    BUFFER.set(buffer, bufferPointer);
+    bufferPointer += buffer.length;
+  }
+
+  if (isSpeech) {
+    if (!isRecording) {
+      // Indicate start of recording
+      self.postMessage({
+        type: "status",
+        status: "recording_start",
+        message: "Listening...",
+        duration: "until_next",
+      });
+    }
+    // Start or continue recording
+    isRecording = true;
+    postSpeechSamples = 0; // Reset the post-speech samples
+    return;
+  }
+
+  postSpeechSamples += buffer.length;
+
+  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
+  // So, we check whether we have reached the end of the current audio chunk.
+  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+    // There was a short pause, but not long enough to consider the end of a speech chunk
+    // (e.g., the speaker took a breath), so we continue recording
+    return;
+  }
+
+  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+    // The entire buffer (including the new chunk) is smaller than the minimum
+    // duration of a speech chunk, so we can safely discard the buffer.
+    resetAfterRecording();
+    return;
+  }
+
+  dispatchForTranscriptionAndResetAudioBuffer();
+};
+
+function greet(text) {
+  isPlaying = true;
+  const splitter = new TextSplitterStream();
+  const stream = tts.stream(splitter, { voice });
+  (async () => {
+    for await (const { text: chunkText, audio } of stream) {
+      self.postMessage({ type: "output", text: chunkText, result: audio });
+    }
+  })();
+  splitter.push(text);
+  splitter.close();
+  messages.push({ role: "assistant", content: text });
+}
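Taken together, the worker exposes a small message protocol: it accepts `audio` (with a 512-sample `buffer`), `start_call`, `end_call`, `interrupt`, `set_voice`, and `playback_ended`, and emits `info`, `status` (`recording_start`, `recording_end`, and `ready` with the voice list), `output` (text plus a TTS audio chunk), and `error`. A minimal hypothetical client, separate from the real UI in `src/App.jsx`:

```js
// Hypothetical minimal client for src/worker.js; the message names are taken
// from the switch statements above, everything else is illustrative.
const worker = new Worker(new URL("./worker.js", import.meta.url), {
  type: "module",
});

worker.addEventListener("message", ({ data }) => {
  if (data.error) return console.error(data.error);
  if (data.type === "status" && data.status === "ready") {
    worker.postMessage({ type: "set_voice", voice: "af_heart" });
    worker.postMessage({ type: "start_call" });
  } else if (data.type === "output") {
    // data.result is a kokoro-js audio chunk; after playing it, acknowledge:
    // worker.postMessage({ type: "playback_ended" });
  }
});

// Microphone chunks (e.g. from the VAD worklet) are streamed in like so:
// worker.postMessage({ type: "audio", buffer: new Float32Array(512) });
```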
vite.config.js ADDED
@@ -0,0 +1,19 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react";
+import tailwindcss from "@tailwindcss/vite";
+
+// https://vite.dev/config/
+export default defineConfig({
+  plugins: [tailwindcss(), react()],
+  build: {
+    target: "esnext",
+  },
+  worker: {
+    format: "es",
+  },
+  resolve: {
+    // Only bundle a single instance of Transformers.js
+    // (shared by `@huggingface/transformers` and `kokoro-js`)
+    dedupe: ["@huggingface/transformers"],
+  },
+});
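Two settings here are load-bearing for this app: `worker.format: "es"` together with `build.target: "esnext"` keeps `src/worker.js` a module worker, so its ESM imports and top-level `await` survive the production build, and `resolve.dedupe` ensures `kokoro-js` and the app share one copy of `@huggingface/transformers` rather than each bundling (and initializing) its own.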