Xenova (HF Staff) committed
Commit e9129d0 · verified · 1 Parent(s): 8834b4b

Upload 13 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ public/logo.png filter=lfs diff=lfs merge=lfs -text
eslint.config.js ADDED
@@ -0,0 +1,33 @@
+ import js from "@eslint/js";
+ import globals from "globals";
+ import reactHooks from "eslint-plugin-react-hooks";
+ import reactRefresh from "eslint-plugin-react-refresh";
+
+ export default [
+   { ignores: ["dist"] },
+   {
+     files: ["**/*.{js,jsx}"],
+     languageOptions: {
+       ecmaVersion: 2020,
+       globals: globals.browser,
+       parserOptions: {
+         ecmaVersion: "latest",
+         ecmaFeatures: { jsx: true },
+         sourceType: "module",
+       },
+     },
+     plugins: {
+       "react-hooks": reactHooks,
+       "react-refresh": reactRefresh,
+     },
+     rules: {
+       ...js.configs.recommended.rules,
+       ...reactHooks.configs.recommended.rules,
+       "no-unused-vars": ["error", { varsIgnorePattern: "^[A-Z_]" }],
+       "react-refresh/only-export-components": [
+         "warn",
+         { allowConstantExport: true },
+       ],
+     },
+   },
+ ];
index.html ADDED
@@ -0,0 +1,13 @@
+ <!doctype html>
+ <html lang="en">
+   <head>
+     <meta charset="UTF-8" />
+     <link rel="icon" type="image/png" href="/logo.png" />
+     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+     <title>Transformers.js | Speech-to-speech demo</title>
+   </head>
+   <body>
+     <div id="root"></div>
+     <script type="module" src="/src/main.jsx"></script>
+   </body>
+ </html>
package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
package.json CHANGED
@@ -1,39 +1,32 @@
  {
-   "name": "react-template",
-   "version": "0.1.0",
+   "name": "speech-to-speech",
    "private": true,
-   "dependencies": {
-     "@testing-library/dom": "^10.4.0",
-     "@testing-library/jest-dom": "^6.6.3",
-     "@testing-library/react": "^16.3.0",
-     "@testing-library/user-event": "^13.5.0",
-     "react": "^19.1.0",
-     "react-dom": "^19.1.0",
-     "react-scripts": "5.0.1",
-     "web-vitals": "^2.1.4"
-   },
+   "version": "0.0.0",
+   "type": "module",
    "scripts": {
-     "start": "react-scripts start",
-     "build": "react-scripts build",
-     "test": "react-scripts test",
-     "eject": "react-scripts eject"
+     "dev": "vite",
+     "build": "vite build",
+     "lint": "eslint .",
+     "preview": "vite preview"
    },
-   "eslintConfig": {
-     "extends": [
-       "react-app",
-       "react-app/jest"
-     ]
+   "dependencies": {
+     "@huggingface/transformers": "^3.5.2",
+     "@tailwindcss/vite": "^4.1.4",
+     "kokoro-js": "^1.2.1",
+     "lucide-react": "^0.503.0",
+     "react": "^19.0.0",
+     "react-dom": "^19.0.0",
+     "tailwindcss": "^4.1.4"
    },
-   "browserslist": {
-     "production": [
-       ">0.2%",
-       "not dead",
-       "not op_mini all"
-     ],
-     "development": [
-       "last 1 chrome version",
-       "last 1 firefox version",
-       "last 1 safari version"
-     ]
+   "devDependencies": {
+     "@eslint/js": "^9.22.0",
+     "@types/react": "^19.0.10",
+     "@types/react-dom": "^19.0.4",
+     "@vitejs/plugin-react": "^4.3.4",
+     "eslint": "^9.22.0",
+     "eslint-plugin-react-hooks": "^5.2.0",
+     "eslint-plugin-react-refresh": "^0.4.19",
+     "globals": "^16.0.0",
+     "vite": "^6.3.1"
    }
  }
public/logo.png ADDED

Git LFS Details

  • SHA256: 9fb2bd90d1eeab88414681bb80464bc723ab57fb2c5b2f33367f16a0157ed5c0
  • Pointer size: 131 Bytes
  • Size of remote file: 634 kB
src/App.jsx ADDED
@@ -0,0 +1,367 @@
+ import { useEffect, useState, useRef } from "react";
+ import { Mic, PhoneOff, ChevronDown } from "lucide-react";
+ import { INPUT_SAMPLE_RATE } from "./constants";
+
+ import WORKLET from "./play-worklet.js";
+
+ export default function App() {
+   const [callStartTime, setCallStartTime] = useState(null);
+   const [callStarted, setCallStarted] = useState(false);
+   const [playing, setPlaying] = useState(false);
+
+   const [voice, setVoice] = useState("af_heart");
+   const [voices, setVoices] = useState([]);
+
+   const [isListening, setIsListening] = useState(false);
+   const [isSpeaking, setIsSpeaking] = useState(false);
+   const [listeningScale, setListeningScale] = useState(1);
+   const [speakingScale, setSpeakingScale] = useState(1);
+   const [ripples, setRipples] = useState([]);
+
+   const [ready, setReady] = useState(false);
+   const [error, setError] = useState(null);
+   const [elapsedTime, setElapsedTime] = useState("00:00");
+   const worker = useRef(null);
+
+   const node = useRef(null);
+
+   useEffect(() => {
+     worker.current?.postMessage({
+       type: "set_voice",
+       voice,
+     });
+   }, [voice]);
+
+   useEffect(() => {
+     if (!callStarted) {
+       // Reset worker state after call ends
+       worker.current?.postMessage({
+         type: "end_call",
+       });
+     }
+   }, [callStarted]);
+
+   useEffect(() => {
+     if (callStarted && callStartTime) {
+       const interval = setInterval(() => {
+         const diff = Math.floor((Date.now() - callStartTime) / 1000);
+         const minutes = String(Math.floor(diff / 60)).padStart(2, "0");
+         const seconds = String(diff % 60).padStart(2, "0");
+         setElapsedTime(`${minutes}:${seconds}`);
+       }, 1000);
+       return () => clearInterval(interval);
+     } else {
+       setElapsedTime("00:00");
+     }
+   }, [callStarted, callStartTime]);
+
+   useEffect(() => {
+     worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
+       type: "module",
+     });
+
+     const onMessage = ({ data }) => {
+       console.log("Worker message:", data);
+       if (data.error) {
+         return onError(data.error);
+       }
+
+       switch (data.type) {
+         case "status":
+           if (data.status === "recording_start") {
+             setIsListening(true);
+             setIsSpeaking(false);
+           } else if (data.status === "recording_end") {
+             setIsListening(false);
+           } else if (data.status === "ready") {
+             setVoices(data.voices);
+             setReady(true);
+           }
+           break;
+         case "output":
+           if (!playing) {
+             node.current?.port.postMessage(data.result.audio);
+             setPlaying(true);
+             setIsSpeaking(true);
+             setIsListening(false);
+           }
+           break;
+       }
+     };
+     const onError = (err) => setError(err.message);
+
+     worker.current.addEventListener("message", onMessage);
+     worker.current.addEventListener("error", onError);
+
+     return () => {
+       worker.current.removeEventListener("message", onMessage);
+       worker.current.removeEventListener("error", onError);
+     };
+   }, []);
+
+   useEffect(() => {
+     if (!callStarted) return;
+
+     let worklet;
+     let inputAudioContext;
+     let source;
+     let ignore = false;
+
+     let outputAudioContext;
+     const audioStreamPromise = navigator.mediaDevices.getUserMedia({
+       audio: {
+         channelCount: 1,
+         echoCancellation: true,
+         autoGainControl: true,
+         noiseSuppression: true,
+         sampleRate: INPUT_SAMPLE_RATE,
+       },
+     });
+
+     audioStreamPromise
+       .then(async (stream) => {
+         if (ignore) return;
+
+         inputAudioContext = new (window.AudioContext ||
+           window.webkitAudioContext)({
+           sampleRate: INPUT_SAMPLE_RATE,
+         });
+
+         const analyser = inputAudioContext.createAnalyser();
+         analyser.fftSize = 256;
+         source = inputAudioContext.createMediaStreamSource(stream);
+         source.connect(analyser);
+
+         const inputDataArray = new Uint8Array(analyser.frequencyBinCount);
+
+         function calculateRMS(array) {
+           let sum = 0;
+           for (let i = 0; i < array.length; ++i) {
+             const normalized = array[i] / 128 - 1;
+             sum += normalized * normalized;
+           }
+           const rms = Math.sqrt(sum / array.length);
+           return rms;
+         }
+
+         await inputAudioContext.audioWorklet.addModule(
+           new URL("./vad-processor.js", import.meta.url),
+         );
+         worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
+           numberOfInputs: 1,
+           numberOfOutputs: 0,
+           channelCount: 1,
+           channelCountMode: "explicit",
+           channelInterpretation: "discrete",
+         });
+
+         source.connect(worklet);
+         worklet.port.onmessage = (event) => {
+           const { buffer } = event.data;
+           worker.current?.postMessage({ type: "audio", buffer });
+         };
+
+         outputAudioContext = new AudioContext({
+           sampleRate: 24000,
+         });
+         outputAudioContext.resume();
+
+         const blob = new Blob([`(${WORKLET.toString()})()`], {
+           type: "application/javascript",
+         });
+         const url = URL.createObjectURL(blob);
+         await outputAudioContext.audioWorklet.addModule(url);
+         URL.revokeObjectURL(url);
+
+         node.current = new AudioWorkletNode(
+           outputAudioContext,
+           "buffered-audio-worklet-processor",
+         );
+
+         node.current.port.onmessage = (event) => {
+           if (event.data.type === "playback_ended") {
+             setPlaying(false);
+             setIsSpeaking(false);
+             worker.current?.postMessage({ type: "playback_ended" });
+           }
+         };
+
+         const outputAnalyser = outputAudioContext.createAnalyser();
+         outputAnalyser.fftSize = 256;
+
+         node.current.connect(outputAnalyser);
+         outputAnalyser.connect(outputAudioContext.destination);
+
+         const outputDataArray = new Uint8Array(
+           outputAnalyser.frequencyBinCount,
+         );
+
+         function updateVisualizers() {
+           analyser.getByteTimeDomainData(inputDataArray);
+           const rms = calculateRMS(inputDataArray);
+           const targetScale = 1 + Math.min(1.25 * rms, 0.25);
+           setListeningScale((prev) => prev + (targetScale - prev) * 0.25);
+
+           outputAnalyser.getByteTimeDomainData(outputDataArray);
+           const outputRMS = calculateRMS(outputDataArray);
+           const targetOutputScale = 1 + Math.min(1.25 * outputRMS, 0.25);
+           setSpeakingScale((prev) => prev + (targetOutputScale - prev) * 0.25);
+
+           requestAnimationFrame(updateVisualizers);
+         }
+         updateVisualizers();
+       })
+       .catch((err) => {
+         setError(err.message);
+         console.error(err);
+       });
+
+     return () => {
+       ignore = true;
+
+       audioStreamPromise.then((stream) =>
+         stream.getTracks().forEach((track) => track.stop()),
+       );
+       source?.disconnect();
+       worklet?.disconnect();
+       inputAudioContext?.close();
+
+       outputAudioContext?.close();
+     };
+   }, [callStarted]);
+
+   useEffect(() => {
+     if (!callStarted) return;
+     const interval = setInterval(() => {
+       const id = Date.now();
+       setRipples((prev) => [...prev, id]);
+       setTimeout(() => {
+         setRipples((prev) => prev.filter((r) => r !== id));
+       }, 1500);
+     }, 1000);
+     return () => clearInterval(interval);
+   }, [callStarted]);
+
+   return (
+     <div className="h-screen min-h-[240px] flex items-center justify-center bg-gray-50 p-4 relative">
+       <div className="h-full max-h-[320px] w-[640px] bg-white rounded-xl shadow-lg p-8 flex items-center justify-between space-x-16">
+         <div className="text-green-700 w-[140px]">
+           <div className="text-xl font-bold flex justify-between">
+             {voices?.[voice]?.name}
+             <span className="font-normal text-gray-500">{elapsedTime}</span>
+           </div>
+           <div className="text-base relative">
+             <button
+               type="button"
+               disabled={!ready}
+               className={`w-full flex items-center justify-between border border-gray-300 rounded-md transition-colors ${
+                 ready
+                   ? "bg-transparent hover:border-gray-400"
+                   : "bg-gray-100 opacity-50 cursor-not-allowed"
+               }`}
+             >
+               <span className="px-2 py-1">Select voice</span>
+               <ChevronDown className="absolute right-2" />
+             </button>
+             <select
+               value={voice}
+               onChange={(e) => setVoice(e.target.value)}
+               className="absolute inset-0 opacity-0 cursor-pointer"
+               disabled={!ready}
+             >
+               {Object.entries(voices).map(([key, v]) => (
+                 <option key={key} value={key}>
+                   {`${v.name} (${
+                     v.language === "en-us" ? "American" : v.language
+                   } ${v.gender})`}
+                 </option>
+               ))}
+             </select>
+           </div>
+         </div>
+
+         <div className="relative flex items-center justify-center w-32 h-32 flex-shrink-0 aspect-square">
+           {callStarted &&
+             ripples.map((id) => (
+               <div
+                 key={id}
+                 className="absolute inset-0 rounded-full border-2 border-green-200 pointer-events-none"
+                 style={{ animation: "ripple 1.5s ease-out forwards" }}
+               />
+             ))}
+           <div className="absolute z-10 text-lg text-gray-700">
+             {!ready ? "Loading..." : ""}
+             {isListening && "Listening..."}
+             {isSpeaking && "Speaking..."}
+           </div>
+           {/* Pulsing loader while initializing */}
+           <div
+             className={`absolute w-32 h-32 rounded-full bg-green-200 ${
+               !ready ? "animate-ping opacity-75" : ""
+             }`}
+             style={{ animationDuration: "1.5s" }}
+           />
+           {/* Main rings */}
+           <div
+             className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-300 ${
+               !ready ? "opacity-0" : ""
+             }`}
+             style={{ transform: `scale(${speakingScale})` }}
+           />
+           <div
+             className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-200 ${
+               !ready ? "opacity-0" : ""
+             }`}
+             style={{ transform: `scale(${listeningScale})` }}
+           />
+         </div>
+
+         <div className="space-y-4 w-[140px]">
+           {callStarted ? (
+             <button
+               className="flex items-center space-x-2 px-4 py-2 bg-red-100 text-red-700 rounded-md hover:bg-red-200"
+               onClick={() => {
+                 setCallStarted(false);
+                 setCallStartTime(null);
+                 setPlaying(false);
+                 setIsListening(false);
+                 setIsSpeaking(false);
+               }}
+             >
+               <PhoneOff className="w-5 h-5" />
+               <span>End call</span>
+             </button>
+           ) : (
+             <button
+               className={`flex items-center space-x-2 px-4 py-2 rounded-md ${
+                 ready
+                   ? "bg-blue-100 text-blue-700 hover:bg-blue-200"
+                   : "bg-blue-100 text-blue-700 opacity-50 cursor-not-allowed"
+               }`}
+               onClick={() => {
+                 setCallStartTime(Date.now());
+                 setCallStarted(true);
+                 worker.current?.postMessage({ type: "start_call" });
+               }}
+               disabled={!ready}
+             >
+               <span>Start call</span>
+             </button>
+           )}
+         </div>
+       </div>
+
+       <div className="absolute bottom-4 text-sm">
+         Built with{" "}
+         <a
+           href="https://github.com/huggingface/transformers.js"
+           rel="noopener noreferrer"
+           target="_blank"
+           className="text-blue-600 hover:underline"
+         >
+           🤗 Transformers.js
+         </a>
+       </div>
+     </div>
+   );
+ }
src/constants.js ADDED
@@ -0,0 +1,53 @@
+ /**
+  * Sample rate of the input audio.
+  * Coincidentally, this is the same for both models (Moonshine and Silero VAD).
+  */
+ export const INPUT_SAMPLE_RATE = 16000;
+ const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;
+
+ /**
+  * Probabilities ABOVE this value are considered SPEECH.
+  */
+ export const SPEECH_THRESHOLD = 0.3;
+
+ /**
+  * If the current state is SPEECH, and the probability of the next state
+  * is below this value, it is considered NON-SPEECH.
+  */
+ export const EXIT_THRESHOLD = 0.1;
+
+ /**
+  * After each speech chunk, wait for at least this amount of silence
+  * before considering the next chunk as a new speech chunk.
+  */
+ export const MIN_SILENCE_DURATION_MS = 400;
+ export const MIN_SILENCE_DURATION_SAMPLES =
+   MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;
+
+ /**
+  * Pad the speech chunk with this amount on each side.
+  */
+ export const SPEECH_PAD_MS = 80;
+ export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;
+
+ /**
+  * Final speech chunks below this duration are discarded.
+  */
+ export const MIN_SPEECH_DURATION_SAMPLES = 250 * INPUT_SAMPLE_RATE_MS; // 250 ms
+
+ /**
+  * Maximum duration of audio (in seconds) that can be handled by Moonshine.
+  */
+ export const MAX_BUFFER_DURATION = 30;
+
+ /**
+  * Size of the incoming buffers.
+  */
+ export const NEW_BUFFER_SIZE = 512;
+
+ /**
+  * The number of previous buffers to keep, to ensure the audio is padded correctly.
+  */
+ export const MAX_NUM_PREV_BUFFERS = Math.ceil(
+   SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
+ );
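
For a quick sanity check, the derived constants above work out as follows (16 samples per millisecond at the 16 kHz input rate):

// MIN_SILENCE_DURATION_SAMPLES = 400 ms * 16 samples/ms = 6400 samples
// SPEECH_PAD_SAMPLES           = 80 ms  * 16 samples/ms = 1280 samples
// MIN_SPEECH_DURATION_SAMPLES  = 250 ms * 16 samples/ms = 4000 samples
// MAX_NUM_PREV_BUFFERS         = ceil(1280 / 512)       = 3 previous 512-sample buffers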
src/index.css ADDED
@@ -0,0 +1,12 @@
+ @import "tailwindcss";
+
+ @keyframes ripple {
+   from {
+     transform: scale(1);
+     opacity: 0.7;
+   }
+   to {
+     transform: scale(2);
+     opacity: 0;
+   }
+ }
src/main.jsx ADDED
@@ -0,0 +1,10 @@
+ import { StrictMode } from "react";
+ import { createRoot } from "react-dom/client";
+ import "./index.css";
+ import App from "./App.jsx";
+
+ createRoot(document.getElementById("root")).render(
+   <StrictMode>
+     <App />
+   </StrictMode>,
+ );
src/play-worklet.js ADDED
@@ -0,0 +1,73 @@
+ export default () => {
+   class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
+     constructor() {
+       super();
+       this.bufferQueue = [];
+       this.currentChunkOffset = 0;
+       this.hadData = false;
+
+       this.port.onmessage = (event) => {
+         const data = event.data;
+         if (data instanceof Float32Array) {
+           this.hadData = true;
+           this.bufferQueue.push(data);
+         } else if (data === "stop") {
+           this.bufferQueue = [];
+           this.currentChunkOffset = 0;
+         }
+       };
+     }
+
+     process(inputs, outputs) {
+       const channel = outputs[0][0];
+       if (!channel) return true;
+
+       const numSamples = channel.length;
+       let outputIndex = 0;
+
+       if (this.hadData && this.bufferQueue.length === 0) {
+         this.port.postMessage({ type: "playback_ended" });
+         this.hadData = false;
+       }
+
+       while (outputIndex < numSamples) {
+         if (this.bufferQueue.length > 0) {
+           const currentChunk = this.bufferQueue[0];
+           const remainingSamples =
+             currentChunk.length - this.currentChunkOffset;
+           const samplesToCopy = Math.min(
+             remainingSamples,
+             numSamples - outputIndex,
+           );
+
+           channel.set(
+             currentChunk.subarray(
+               this.currentChunkOffset,
+               this.currentChunkOffset + samplesToCopy,
+             ),
+             outputIndex,
+           );
+
+           this.currentChunkOffset += samplesToCopy;
+           outputIndex += samplesToCopy;
+
+           // Remove the chunk if fully consumed.
+           if (this.currentChunkOffset >= currentChunk.length) {
+             this.bufferQueue.shift();
+             this.currentChunkOffset = 0;
+           }
+         } else {
+           // If no data is available, fill the rest of the buffer with silence.
+           channel.fill(0, outputIndex);
+           outputIndex = numSamples;
+         }
+       }
+       return true;
+     }
+   }
+
+   registerProcessor(
+     "buffered-audio-worklet-processor",
+     BufferedAudioWorkletProcessor,
+   );
+ };
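
A minimal usage sketch of the processor above (variable names here are illustrative; the real wiring lives in src/App.jsx, which inlines the exported factory via a Blob URL so no separate worklet file needs to be served):

import WORKLET from "./play-worklet.js";

const ctx = new AudioContext({ sampleRate: 24000 });
const blob = new Blob([`(${WORKLET.toString()})()`], {
  type: "application/javascript",
});
await ctx.audioWorklet.addModule(URL.createObjectURL(blob));

const node = new AudioWorkletNode(ctx, "buffered-audio-worklet-processor");
node.connect(ctx.destination);

node.port.postMessage(new Float32Array(24000)); // enqueue 1 s of (silent) audio
node.port.onmessage = ({ data }) => {
  if (data.type === "playback_ended") console.log("queue drained");
};
// node.port.postMessage("stop"); // flushes any queued audio immediately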
src/vad-processor.js ADDED
@@ -0,0 +1,37 @@
+ const MIN_CHUNK_SIZE = 512;
+ let globalPointer = 0;
+ let globalBuffer = new Float32Array(MIN_CHUNK_SIZE);
+
+ class VADProcessor extends AudioWorkletProcessor {
+   process(inputs, outputs, parameters) {
+     const buffer = inputs[0][0];
+     if (!buffer) return; // buffer is null when the stream ends
+
+     if (buffer.length > MIN_CHUNK_SIZE) {
+       // If the buffer is larger than the minimum chunk size, send the entire buffer
+       this.port.postMessage({ buffer });
+     } else {
+       const remaining = MIN_CHUNK_SIZE - globalPointer;
+       if (buffer.length >= remaining) {
+         // The buffer is larger than (or equal to) the remaining space in the
+         // global buffer, so copy just enough to fill it
+         globalBuffer.set(buffer.subarray(0, remaining), globalPointer);
+
+         // Send the global buffer
+         this.port.postMessage({ buffer: globalBuffer });
+
+         // Reset the global buffer and store the leftover samples
+         globalBuffer.fill(0);
+         globalBuffer.set(buffer.subarray(remaining), 0);
+         globalPointer = buffer.length - remaining;
+       } else {
+         // The buffer fits in the remaining space, so append it to the global buffer
+         globalBuffer.set(buffer, globalPointer);
+         globalPointer += buffer.length;
+       }
+     }
+
+     return true; // Keep the processor alive
+   }
+ }
+
+ registerProcessor("vad-processor", VADProcessor);
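
As a worked example of the accumulation above, assuming the standard 128-frame Web Audio render quantum:

// 128 frames per process() call (typical render quantum)
// 512 / 128 = 4 calls per posted chunk
// 512 / 16000 Hz = 0.032 s -> one VAD-sized chunk roughly every 32 ms of microphone audio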
src/worker.js ADDED
@@ -0,0 +1,355 @@
+ import {
+   // VAD
+   AutoModel,
+
+   // LLM
+   AutoTokenizer,
+   AutoModelForCausalLM,
+   TextStreamer,
+   InterruptableStoppingCriteria,
+
+   // Speech recognition
+   Tensor,
+   pipeline,
+ } from "@huggingface/transformers";
+
+ import { KokoroTTS, TextSplitterStream } from "kokoro-js";
+
+ import {
+   MAX_BUFFER_DURATION,
+   INPUT_SAMPLE_RATE,
+   SPEECH_THRESHOLD,
+   EXIT_THRESHOLD,
+   SPEECH_PAD_SAMPLES,
+   MAX_NUM_PREV_BUFFERS,
+   MIN_SILENCE_DURATION_SAMPLES,
+   MIN_SPEECH_DURATION_SAMPLES,
+ } from "./constants";
+
+ const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
+ let voice;
+ const tts = await KokoroTTS.from_pretrained(model_id, {
+   dtype: "fp32",
+   device: "webgpu",
+ });
+
+ const device = "webgpu";
+ self.postMessage({ type: "info", message: `Using device: "${device}"` });
+ self.postMessage({
+   type: "info",
+   message: "Loading models...",
+   duration: "until_next",
+ });
+
+ // Load models
+ const silero_vad = await AutoModel.from_pretrained(
+   "onnx-community/silero-vad",
+   {
+     config: { model_type: "custom" },
+     dtype: "fp32", // Full-precision
+   },
+ ).catch((error) => {
+   self.postMessage({ error });
+   throw error;
+ });
+
+ const DEVICE_DTYPE_CONFIGS = {
+   webgpu: {
+     encoder_model: "fp32",
+     decoder_model_merged: "fp32",
+   },
+   wasm: {
+     encoder_model: "fp32",
+     decoder_model_merged: "q8",
+   },
+ };
+ const transcriber = await pipeline(
+   "automatic-speech-recognition",
+   "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX"
+   {
+     device,
+     dtype: DEVICE_DTYPE_CONFIGS[device],
+   },
+ ).catch((error) => {
+   self.postMessage({ error });
+   throw error;
+ });
+
+ await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders
+
+ const llm_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct";
+ const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
+ const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
+   dtype: "q4f16",
+   device: "webgpu",
+ });
+
+ const SYSTEM_MESSAGE = {
+   role: "system",
+   content:
+     "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
+ };
+ await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders
+
+ let messages = [SYSTEM_MESSAGE];
+ let past_key_values_cache;
+ let stopping_criteria;
+ self.postMessage({
+   type: "status",
+   status: "ready",
+   message: "Ready!",
+   voices: tts.voices,
+ });
+
+ // Global audio buffer to store incoming audio
+ const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
+ let bufferPointer = 0;
+
+ // Initial state for VAD
+ const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
+ let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
+
+ // Whether we are in the process of adding audio to the buffer
+ let isRecording = false;
+ let isPlaying = false; // Whether a response is currently being played back
+
+ /**
+  * Perform Voice Activity Detection (VAD)
+  * @param {Float32Array} buffer The new audio buffer
+  * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
+  */
+ async function vad(buffer) {
+   const input = new Tensor("float32", buffer, [1, buffer.length]);
+
+   const { stateN, output } = await silero_vad({ input, sr, state });
+   state = stateN; // Update state
+
+   const isSpeech = output.data[0];
+
+   // Use heuristics to determine if the buffer is speech or not
+   return (
+     // Case 1: We are above the threshold (definitely speech)
+     isSpeech > SPEECH_THRESHOLD ||
+     // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
+     (isRecording && isSpeech >= EXIT_THRESHOLD)
+   );
+ }
+
+ /**
+  * Transcribe the audio buffer and generate a spoken response
+  * @param {Float32Array} buffer The audio buffer
+  * @param {Object} data Additional data
+  */
+ const speechToSpeech = async (buffer, data) => {
+   isPlaying = true;
+
+   // 1. Transcribe the audio from the user
+   const text = await transcriber(buffer).then(({ text }) => text.trim());
+   if (["", "[BLANK_AUDIO]"].includes(text)) {
+     // If the transcription is empty or blank audio, skip the rest of the processing
+     return;
+   }
+   messages.push({ role: "user", content: text });
+
+   // Set up text-to-speech streaming
+   const splitter = new TextSplitterStream();
+   const stream = tts.stream(splitter, {
+     voice,
+   });
+   (async () => {
+     for await (const { text, phonemes, audio } of stream) {
+       self.postMessage({ type: "output", text, result: audio });
+     }
+   })();
+
+   // 2. Generate a response using the LLM
+   const inputs = tokenizer.apply_chat_template(messages, {
+     add_generation_prompt: true,
+     return_dict: true,
+   });
+   const streamer = new TextStreamer(tokenizer, {
+     skip_prompt: true,
+     skip_special_tokens: true,
+     callback_function: (text) => {
+       splitter.push(text);
+     },
+     token_callback_function: () => {},
+   });
+
+   stopping_criteria = new InterruptableStoppingCriteria();
+   const { past_key_values, sequences } = await llm.generate({
+     ...inputs,
+     past_key_values: past_key_values_cache,
+
+     do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
+     max_new_tokens: 1024,
+     streamer,
+     stopping_criteria,
+     return_dict_in_generate: true,
+   });
+   past_key_values_cache = past_key_values;
+
+   // Finally, close the stream to signal that no more text will be added.
+   splitter.close();
+
+   const decoded = tokenizer.batch_decode(
+     sequences.slice(null, [inputs.input_ids.dims[1], null]),
+     { skip_special_tokens: true },
+   );
+
+   messages.push({ role: "assistant", content: decoded[0] });
+ };
+
+ // Track the number of samples after the last speech chunk
+ let postSpeechSamples = 0;
+ const resetAfterRecording = (offset = 0) => {
+   self.postMessage({
+     type: "status",
+     status: "recording_end",
+     message: "Transcribing...",
+     duration: "until_next",
+   });
+   BUFFER.fill(0, offset);
+   bufferPointer = offset;
+   isRecording = false;
+   postSpeechSamples = 0;
+ };
+
+ const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
+   // Get start and end time of the speech segment, minus the padding
+   const now = Date.now();
+   const end =
+     now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
+   const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
+   const duration = end - start;
+   const overflowLength = overflow?.length ?? 0;
+
+   // Send the audio buffer to the speech-to-speech pipeline
+   const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);
+
+   const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
+   const paddedBuffer = new Float32Array(prevLength + buffer.length);
+   let offset = 0;
+   for (const prev of prevBuffers) {
+     paddedBuffer.set(prev, offset);
+     offset += prev.length;
+   }
+   paddedBuffer.set(buffer, offset);
+   speechToSpeech(paddedBuffer, { start, end, duration });
+
+   // Set overflow (if present) and reset the rest of the audio buffer
+   if (overflow) {
+     BUFFER.set(overflow, 0);
+   }
+   resetAfterRecording(overflowLength);
+ };
+
+ let prevBuffers = [];
+ self.onmessage = async (event) => {
+   const { type, buffer } = event.data;
+
+   // Refuse new audio while playing back
+   if (type === "audio" && isPlaying) return;
+
+   switch (type) {
+     case "start_call": {
+       const name = tts.voices[voice ?? "af_heart"]?.name ?? "Heart";
+       greet(`Hey there, my name is ${name}! How can I help you today?`);
+       return;
+     }
+     case "end_call":
+       messages = [SYSTEM_MESSAGE];
+       past_key_values_cache = null;
+     // (fall through to interrupt any in-progress generation)
+     case "interrupt":
+       stopping_criteria?.interrupt();
+       return;
+     case "set_voice":
+       voice = event.data.voice;
+       return;
+     case "playback_ended":
+       isPlaying = false;
+       return;
+   }
+
+   const wasRecording = isRecording; // Save current state
+   const isSpeech = await vad(buffer);
+
+   if (!wasRecording && !isSpeech) {
+     // We are not recording, and the buffer is not speech,
+     // so we will probably discard the buffer. So, we insert
+     // it into a FIFO queue with a maximum size of MAX_NUM_PREV_BUFFERS
+     if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+       // If the queue is full, we discard the oldest buffer
+       prevBuffers.shift();
+     }
+     prevBuffers.push(buffer);
+     return;
+   }
+
+   const remaining = BUFFER.length - bufferPointer;
+   if (buffer.length >= remaining) {
+     // The buffer is larger than (or equal to) the remaining space in the global buffer,
+     // so we perform transcription and copy the overflow to the global buffer
+     BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
+     bufferPointer += remaining;
+
+     // Dispatch the audio buffer
+     const overflow = buffer.subarray(remaining);
+     dispatchForTranscriptionAndResetAudioBuffer(overflow);
+     return;
+   } else {
+     // The buffer is smaller than the remaining space in the global buffer,
+     // so we copy it to the global buffer
+     BUFFER.set(buffer, bufferPointer);
+     bufferPointer += buffer.length;
+   }
+
+   if (isSpeech) {
+     if (!isRecording) {
+       // Indicate start of recording
+       self.postMessage({
+         type: "status",
+         status: "recording_start",
+         message: "Listening...",
+         duration: "until_next",
+       });
+     }
+     // Start or continue recording
+     isRecording = true;
+     postSpeechSamples = 0; // Reset the post-speech samples
+     return;
+   }
+
+   postSpeechSamples += buffer.length;
+
+   // At this point we're confident that we were recording (wasRecording === true),
+   // but the latest buffer is not speech.
+   // So, we check whether we have reached the end of the current audio chunk.
+   if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+     // There was a short pause, but not long enough to consider the end of a speech chunk
+     // (e.g., the speaker took a breath), so we continue recording
+     return;
+   }
+
+   if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+     // The entire buffer (including the new chunk) is smaller than the minimum
+     // duration of a speech chunk, so we can safely discard the buffer.
+     resetAfterRecording();
+     return;
+   }
+
+   dispatchForTranscriptionAndResetAudioBuffer();
+ };
+
+ function greet(text) {
+   isPlaying = true;
+   const splitter = new TextSplitterStream();
+   const stream = tts.stream(splitter, { voice });
+   (async () => {
+     for await (const { text: chunkText, audio } of stream) {
+       self.postMessage({ type: "output", text: chunkText, result: audio });
+     }
+   })();
+   splitter.push(text);
+   splitter.close();
+   messages.push({ role: "assistant", content: text });
+ }
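
For reference, a minimal sketch (with hypothetical variable names) of the main-thread side of the message protocol implemented above; the full version lives in src/App.jsx:

const worker = new Worker(new URL("./worker.js", import.meta.url), {
  type: "module",
});

worker.addEventListener("message", ({ data }) => {
  if (data.error) return console.error(data.error);
  if (data.type === "status" && data.status === "ready") {
    // data.voices lists the available Kokoro voices
    worker.postMessage({ type: "set_voice", voice: "af_heart" });
    worker.postMessage({ type: "start_call" });
  } else if (data.type === "output") {
    // data.result.audio is a Float32Array at 24 kHz, ready for the playback worklet
  }
});

// Stream 512-sample microphone chunks (as produced by the VAD worklet):
worker.postMessage({ type: "audio", buffer: new Float32Array(512) });

// Notify the worker when the playback queue drains, and when the user hangs up:
worker.postMessage({ type: "playback_ended" });
worker.postMessage({ type: "end_call" });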
vite.config.js ADDED
@@ -0,0 +1,19 @@
+ import { defineConfig } from "vite";
+ import react from "@vitejs/plugin-react";
+ import tailwindcss from "@tailwindcss/vite";
+
+ // https://vite.dev/config/
+ export default defineConfig({
+   plugins: [tailwindcss(), react()],
+   build: {
+     target: "esnext",
+   },
+   worker: {
+     format: "es",
+   },
+   resolve: {
+     // Only bundle a single instance of Transformers.js
+     // (shared by `@huggingface/transformers` and `kokoro-js`)
+     dedupe: ["@huggingface/transformers"],
+   },
+ });