Commit
·
2160235
1
Parent(s):
56c75a3
release: build 60371bd7e5a9f74f023fc8c57fca5cced4b0e47b
Browse files
- .gitattributes +2 -0
- README.md +65 -5
- assets/index-DGmKQH7N.js +0 -0
- assets/index-cAxkOY9l.css +1 -0
- assets/play-worklet-CqUYQx_r.js +1 -0
- assets/vad-processor-0sEQXaXZ.js +1 -0
- assets/worker-yoCrhISy.ts +465 -0
- favicon-96x96.png +0 -0
- favicon.svg +8 -0
- index.html +22 -17
- style.css +0 -28
.gitattributes
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
|
2 |
+
# Default
|
3 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
4 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
5 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,10 +1,70 @@
|
|
1 |
---
|
2 |
-
|
3 |
-
emoji: ⚡
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: green
|
6 |
sdk: static
|
7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
emoji: "\U0001F4AC"
|
|
|
|
|
|
|
3 |
sdk: static
|
4 |
pinned: false
|
5 |
+
license: mit
|
6 |
+
title: Realtime Conversational WebGPU (Vue)
|
7 |
+
colorFrom: purple
|
8 |
+
colorTo: indigo
|
9 |
+
models:
|
10 |
+
- HuggingFaceTB/SmolLM2-1.7B-Instruct
|
11 |
+
- onnx-community/whisper-base
|
12 |
+
- onnx-community/silero-vad
|
13 |
+
short_description: Yet another Realtime Conversational WebGPU
|
14 |
---
|
15 |
|
16 |
+
<h1 align="center">Realtime Conversational WebGPU (Vue)</h1>
|
17 |
+
|
18 |
+
<p align="center">
|
19 |
+
[<a href="https://conversational-webgpu-vue.netlify.app/">Try it</a>]
|
20 |
+
</p>
|
21 |
+
|
22 |
+
> Heavily inspired by [WebGPU Video Object Detection - a Hugging Face Space by WebML Community](https://huggingface.co/spaces/webml-community/webgpu-video-object-detection)
|
23 |
+
|
24 |
+
# Realtime Conversational WebGPU
|
25 |
+
|
26 |
+
## Getting Started
|
27 |
+
|
28 |
+
Follow the steps below to set up and run the application.
|
29 |
+
|
30 |
+
### 1. Clone the Repository
|
31 |
+
|
32 |
+
Clone the examples repository from GitHub:
|
33 |
+
|
34 |
+
```sh
|
35 |
+
git clone https://github.com/proj-airi/webai-examples.git
|
36 |
+
```
|
37 |
+
|
38 |
+
### 2. Navigate to the Project Directory
|
39 |
+
|
40 |
+
Change your working directory to the `conversational-webgpu` folder:
|
41 |
+
|
42 |
+
```sh
|
43 |
+
cd apps/conversational-webgpu
|
44 |
+
```
|
45 |
+
|
46 |
+
### 3. Install Dependencies
|
47 |
+
|
48 |
+
Install the necessary dependencies using npm:
|
49 |
+
|
50 |
+
```sh
|
51 |
+
npm i
|
52 |
+
```
|
53 |
+
|
54 |
+
### 4. Run the Development Server
|
55 |
+
|
56 |
+
Start the development server:
|
57 |
+
|
58 |
+
```sh
|
59 |
+
npm run dev
|
60 |
+
```
|
61 |
+
|
62 |
+
The application should now be running locally. Open your browser and go to `http://localhost:5175` to see it in action.
|
63 |
+
|
64 |
+
## Acknowledgements
|
65 |
+
|
66 |
+
Great thanks to the WebML Community for what they have done.
|
67 |
+
|
68 |
+
> [Source code](https://huggingface.co/spaces/webml-community/conversational-webgpu)
|
69 |
+
|
70 |
+
> [UI inspiration](https://app.sesame.com/)
|
assets/index-DGmKQH7N.js
ADDED
The diff for this file is too large to render.
See raw diff
|
|
assets/index-cAxkOY9l.css
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
@keyframes ripple-314fbe9f{0%{transform:scale(1);opacity:.7}to{transform:scale(2);opacity:0}}.embla[data-v-314fbe9f]{position:relative;overflow:hidden}.embla[data-v-314fbe9f]:before,.embla[data-v-314fbe9f]:after{content:"";position:absolute;top:0;bottom:0;width:48px;z-index:1;pointer-events:none}.embla-edge-disabled.embla[data-v-314fbe9f]:before,.embla-edge-disabled.embla[data-v-314fbe9f]:after{display:none}.embla[data-v-314fbe9f]:before{left:-24px;background:linear-gradient(to right,#ffffff 32px,transparent)}.embla[data-v-314fbe9f]:after{right:-24px;background:linear-gradient(to left,#ffffff 32px,transparent)}.dark .embla[data-v-314fbe9f]:before{left:-24px;background:linear-gradient(to right,#121212 32px,transparent)}.dark .embla[data-v-314fbe9f]:after{right:-24px;background:linear-gradient(to left,#121212 32px,transparent)}.fade-enter-active[data-v-314fbe9f],.fade-leave-active[data-v-314fbe9f]{transition:opacity .5s ease}.fade-enter-from[data-v-314fbe9f],.fade-leave-to[data-v-314fbe9f]{opacity:0}.fade-enter-to[data-v-314fbe9f],.fade-leave-from[data-v-314fbe9f]{opacity:1}.fade-scale-enter-active[data-v-314fbe9f],.fade-scale-leave-active[data-v-314fbe9f]{transition:all .2s ease-in-out}.fade-scale-enter-from[data-v-314fbe9f],.fade-scale-leave-to[data-v-314fbe9f]{opacity:0;transform:scale(.8)}.fade-scale-enter-to[data-v-314fbe9f],.fade-scale-leave-from[data-v-314fbe9f]{opacity:1;transform:scale(1)}*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:var(--un-default-border-color, #e5e7eb)}:before,:after{--un-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color 
Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}:root{--bg-co
lor-light: rgb(255 255 255);--bg-color-dark: rgb(18 18 18);--bg-color: var(--bg-color-light)}html,body,#app{height:100%;margin:0;padding:0;overscroll-behavior:none}html{background:var(--bg-color);transition:all .3s ease-in-out}html.dark{--bg-color: var(--bg-color-dark);color-scheme:dark}*,:before,:after{--un-rotate:0;--un-rotate-x:0;--un-rotate-y:0;--un-rotate-z:0;--un-scale-x:1;--un-scale-y:1;--un-scale-z:1;--un-skew-x:0;--un-skew-y:0;--un-translate-x:0;--un-translate-y:0;--un-translate-z:0;--un-pan-x: ;--un-pan-y: ;--un-pinch-zoom: ;--un-scroll-snap-strictness:proximity;--un-ordinal: ;--un-slashed-zero: ;--un-numeric-figure: ;--un-numeric-spacing: ;--un-numeric-fraction: ;--un-border-spacing-x:0;--un-border-spacing-y:0;--un-ring-offset-shadow:0 0 rgb(0 0 0 / 0);--un-ring-shadow:0 0 rgb(0 0 0 / 0);--un-shadow-inset: ;--un-shadow:0 0 rgb(0 0 0 / 0);--un-ring-inset: ;--un-ring-offset-width:0px;--un-ring-offset-color:#fff;--un-ring-width:0px;--un-ring-color:rgb(147 197 253 / .5);--un-blur: ;--un-brightness: ;--un-contrast: ;--un-drop-shadow: ;--un-grayscale: ;--un-hue-rotate: ;--un-invert: ;--un-saturate: ;--un-sepia: ;--un-backdrop-blur: ;--un-backdrop-brightness: ;--un-backdrop-contrast: ;--un-backdrop-grayscale: ;--un-backdrop-hue-rotate: ;--un-backdrop-invert: ;--un-backdrop-opacity: ;--un-backdrop-saturate: ;--un-backdrop-sepia: }::backdrop{--un-rotate:0;--un-rotate-x:0;--un-rotate-y:0;--un-rotate-z:0;--un-scale-x:1;--un-scale-y:1;--un-scale-z:1;--un-skew-x:0;--un-skew-y:0;--un-translate-x:0;--un-translate-y:0;--un-translate-z:0;--un-pan-x: ;--un-pan-y: ;--un-pinch-zoom: ;--un-scroll-snap-strictness:proximity;--un-ordinal: ;--un-slashed-zero: ;--un-numeric-figure: ;--un-numeric-spacing: ;--un-numeric-fraction: ;--un-border-spacing-x:0;--un-border-spacing-y:0;--un-ring-offset-shadow:0 0 rgb(0 0 0 / 0);--un-ring-shadow:0 0 rgb(0 0 0 / 0);--un-shadow-inset: ;--un-shadow:0 0 rgb(0 0 0 / 0);--un-ring-inset: 
;--un-ring-offset-width:0px;--un-ring-offset-color:#fff;--un-ring-width:0px;--un-ring-color:rgb(147 197 253 / .5);--un-blur: ;--un-brightness: ;--un-contrast: ;--un-drop-shadow: ;--un-grayscale: ;--un-hue-rotate: ;--un-invert: ;--un-saturate: ;--un-sepia: ;--un-backdrop-blur: ;--un-backdrop-brightness: ;--un-backdrop-contrast: ;--un-backdrop-grayscale: ;--un-backdrop-hue-rotate: ;--un-backdrop-invert: ;--un-backdrop-opacity: ;--un-backdrop-saturate: ;--un-backdrop-sepia: }@font-face{font-family:DM Mono;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmmono/v15/aFTU7PB1QTsUX8KYthSQBK6PYK3EXw.woff2) format("woff2");unicode-range:U+0100-02BA,U+02BD-02C5,U+02C7-02CC,U+02CE-02D7,U+02DD-02FF,U+0304,U+0308,U+0329,U+1D00-1DBF,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20C0,U+2113,U+2C60-2C7F,U+A720-A7FF}@font-face{font-family:DM Mono;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmmono/v15/aFTU7PB1QTsUX8KYthqQBK6PYK0.woff2) format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:DM Sans;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmsans/v16/rP2tp2ywxg089UriI5-g4vlH9VoD8CmcqZG40F9JadbnoEwAopxRR232RmYJp8I5zzw.woff2) format("woff2");unicode-range:U+0100-02BA,U+02BD-02C5,U+02C7-02CC,U+02CE-02D7,U+02DD-02FF,U+0304,U+0308,U+0329,U+1D00-1DBF,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20C0,U+2113,U+2C60-2C7F,U+A720-A7FF}@font-face{font-family:DM Sans;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmsans/v16/rP2tp2ywxg089UriI5-g4vlH9VoD8CmcqZG40F9JadbnoEwAopxRSW32RmYJp8I5.woff2) 
format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"DM Serif Display";font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmserifdisplay/v16/-nFnOHM81r4j6k0gjAW3mujVU2B2G_5x0vrx52jJ3Q.woff2) format("woff2");unicode-range:U+0100-02BA,U+02BD-02C5,U+02C7-02CC,U+02CE-02D7,U+02DD-02FF,U+0304,U+0308,U+0329,U+1D00-1DBF,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20C0,U+2113,U+2C60-2C7F,U+A720-A7FF}@font-face{font-family:"DM Serif Display";font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmserifdisplay/v16/-nFnOHM81r4j6k0gjAW3mujVU2B2G_Bx0vrx52g.woff2) format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}.i-solar\:end-call-rounded-bold,[i-solar\:end-call-rounded-bold=""]{--un-icon:url("data:image/svg+xml;utf8,%3Csvg viewBox='0 0 24 24' width='1.2em' height='1.2em' xmlns='http://www.w3.org/2000/svg' %3E%3Cpath fill='currentColor' d='m5.607 16.897l1.34-.38C8.156 16.174 9 14.983 9 13.618c0 0 0-1.654 3-1.654s3 1.654 3 1.654c0 1.365.844 2.556 2.053 2.9l1.34.38C20.218 17.414 22 15.91 22 13.85c0-1.237-.277-2.477-1.083-3.347C19.56 9.04 16.807 7 12 7s-7.56 2.039-8.917 3.503C2.277 11.373 2 12.613 2 13.85c0 2.06 1.782 3.565 3.607 3.047'/%3E%3C/svg%3E");-webkit-mask:var(--un-icon) no-repeat;mask:var(--un-icon) no-repeat;-webkit-mask-size:100% 100%;mask-size:100% 100%;background-color:currentColor;color:inherit;width:1.2em;height:1.2em}.i-solar\:phone-bold,[i-solar\:phone-bold=""]{--un-icon:url("data:image/svg+xml;utf8,%3Csvg viewBox='0 0 24 24' width='1.2em' height='1.2em' xmlns='http://www.w3.org/2000/svg' %3E%3Cpath fill='currentColor' d='m16.556 12.906l-.455.453s-1.083 
1.076-4.038-1.862s-1.872-4.014-1.872-4.014l.286-.286c.707-.702.774-1.83.157-2.654L9.374 2.86C8.61 1.84 7.135 1.705 6.26 2.575l-1.57 1.56c-.433.432-.723.99-.688 1.61c.09 1.587.808 5 4.812 8.982c4.247 4.222 8.232 4.39 9.861 4.238c.516-.048.964-.31 1.325-.67l1.42-1.412c.96-.953.69-2.588-.538-3.255l-1.91-1.039c-.806-.437-1.787-.309-2.417.317'/%3E%3C/svg%3E");-webkit-mask:var(--un-icon) no-repeat;mask:var(--un-icon) no-repeat;-webkit-mask-size:100% 100%;mask-size:100% 100%;background-color:currentColor;color:inherit;width:1.2em;height:1.2em}.i-svg-spinners\:3-dots-bounce,[i-svg-spinners\:3-dots-bounce=""]{--un-icon:url("data:image/svg+xml;utf8,%3Csvg viewBox='0 0 24 24' width='1.2em' height='1.2em' xmlns='http://www.w3.org/2000/svg' %3E%3Ccircle cx='4' cy='12' r='3' fill='currentColor'%3E%3Canimate id='svgSpinners3DotsBounce0' attributeName='cy' begin='0;svgSpinners3DotsBounce1.end+0.25s' calcMode='spline' dur='0.6s' keySplines='.33,.66,.66,1;.33,0,.66,.33' values='12;6;12'/%3E%3C/circle%3E%3Ccircle cx='12' cy='12' r='3' fill='currentColor'%3E%3Canimate attributeName='cy' begin='svgSpinners3DotsBounce0.begin+0.1s' calcMode='spline' dur='0.6s' keySplines='.33,.66,.66,1;.33,0,.66,.33' values='12;6;12'/%3E%3C/circle%3E%3Ccircle cx='20' cy='12' r='3' fill='currentColor'%3E%3Canimate id='svgSpinners3DotsBounce1' attributeName='cy' begin='svgSpinners3DotsBounce0.begin+0.2s' calcMode='spline' dur='0.6s' keySplines='.33,.66,.66,1;.33,0,.66,.33' values='12;6;12'/%3E%3C/circle%3E%3C/svg%3E");-webkit-mask:var(--un-icon) no-repeat;mask:var(--un-icon) no-repeat;-webkit-mask-size:100% 100%;mask-size:100% 100%;background-color:currentColor;color:inherit;width:1.2em;height:1.2em}.prose :where(h1,h2,h3,h4,h5,h6):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-headings);font-weight:600;line-height:1.25}.prose :where(a):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-links);text-decoration:underline;font-weight:500}.prose :where(a code):not(:where(.not-prose,.not-prose 
*)){color:var(--un-prose-links)}.prose :where(p,ul,ol,pre):not(:where(.not-prose,.not-prose *)){margin:1em 0;line-height:1.75}.prose :where(blockquote):not(:where(.not-prose,.not-prose *)){margin:1em 0;padding-left:1em;font-style:italic;border-left:.25em solid var(--un-prose-borders)}.prose :where(h1):not(:where(.not-prose,.not-prose *)){margin:1rem 0;font-size:2.25em}.prose :where(h2):not(:where(.not-prose,.not-prose *)){margin:1.75em 0 .5em;font-size:1.75em}.prose :where(h3):not(:where(.not-prose,.not-prose *)){margin:1.5em 0 .5em;font-size:1.375em}.prose :where(h4):not(:where(.not-prose,.not-prose *)){margin:1em 0;font-size:1.125em}.prose :where(img,video):not(:where(.not-prose,.not-prose *)){max-width:100%}.prose :where(figure,picture):not(:where(.not-prose,.not-prose *)){margin:1em 0}.prose :where(figcaption):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-captions);font-size:.875em}.prose :where(code):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-code);font-size:.875em;font-weight:600;font-family:DM Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace}.prose :where(:not(pre)>code):not(:where(.not-prose,.not-prose *)):before,.prose :where(:not(pre)>code):not(:where(.not-prose,.not-prose *)):after{content:"`"}.prose :where(pre):not(:where(.not-prose,.not-prose *)){padding:1.25rem 1.5rem;overflow-x:auto;border-radius:.375rem}.prose :where(pre,code):not(:where(.not-prose,.not-prose *)){white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;hyphens:none;background:transparent}.prose :where(pre code):not(:where(.not-prose,.not-prose *)){font-weight:inherit}.prose :where(ol,ul):not(:where(.not-prose,.not-prose *)){padding-left:1.25em}.prose :where(ol):not(:where(.not-prose,.not-prose *)){list-style-type:decimal}.prose :where(ol[type=A]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-alpha}.prose 
:where(ol[type=a]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-alpha}.prose :where(ol[type=A s]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-alpha}.prose :where(ol[type=a s]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-alpha}.prose :where(ol[type=I]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-roman}.prose :where(ol[type=i]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-roman}.prose :where(ol[type=I s]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-roman}.prose :where(ol[type=i s]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-roman}.prose :where(ol[type="1"]):not(:where(.not-prose,.not-prose *)){list-style-type:decimal}.prose :where(ul):not(:where(.not-prose,.not-prose *)){list-style-type:disc}.prose :where(ol>li):not(:where(.not-prose,.not-prose *))::marker,.prose :where(ul>li):not(:where(.not-prose,.not-prose *))::marker,.prose :where(summary):not(:where(.not-prose,.not-prose *))::marker{color:var(--un-prose-lists)}.prose :where(hr):not(:where(.not-prose,.not-prose *)){margin:2em 0;border:1px solid var(--un-prose-hr)}.prose :where(table):not(:where(.not-prose,.not-prose *)){display:block;margin:1em 0;border-collapse:collapse;overflow-x:auto}.prose :where(tr):not(:where(.not-prose,.not-prose *)):nth-child(2n){background:var(--un-prose-bg-soft)}.prose :where(td,th):not(:where(.not-prose,.not-prose *)){border:1px solid var(--un-prose-borders);padding:.625em 1em}.prose :where(abbr):not(:where(.not-prose,.not-prose *)){cursor:help}.prose :where(kbd):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-code);border:1px solid;padding:.25rem .5rem;font-size:.875em;border-radius:.25rem}.prose :where(details):not(:where(.not-prose,.not-prose *)){margin:1em 0;padding:1.25rem 1.5rem;background:var(--un-prose-bg-soft)}.prose :where(summary):not(:where(.not-prose,.not-prose 
*)){cursor:pointer;font-weight:600}.prose{color:var(--un-prose-body);max-width:65ch}.pointer-events-none{pointer-events:none}.absolute{position:absolute}.relative{position:relative}.inset-0{top:0;right:0;bottom:0;left:0}.z-10{z-index:10}.m-auto{margin:auto}.-mt-4{margin-top:-1rem}.mb-4{margin-bottom:1rem}.hidden{display:none}.aspect-square{aspect-ratio:1/1}.h-100dvh{height:100dvh}.h-32{height:8rem}.h-50{height:12.5rem}.h-80{height:20rem}.h-full,[h-full=""]{height:100%}.min-w-0{min-width:0}.w-100dvw{width:100dvw}.w-120{width:30rem}.w-140{width:35rem}.w-32{width:8rem}.w-fit,[w-fit=""]{width:fit-content}.w-full{width:100%}.flex,[flex=""]{display:flex}.flex-\[0_0_80\%\]{flex:0 0 80%}.flex-shrink-0,.shrink-0{flex-shrink:0}.grow-0{flex-grow:0}.basis-full{flex-basis:100%}.flex-col,[flex-col=""]{flex-direction:column}.transform{transform:translate(var(--un-translate-x)) translateY(var(--un-translate-y)) translateZ(var(--un-translate-z)) rotate(var(--un-rotate)) rotateX(var(--un-rotate-x)) rotateY(var(--un-rotate-y)) rotate(var(--un-rotate-z)) skew(var(--un-skew-x)) skewY(var(--un-skew-y)) scaleX(var(--un-scale-x)) scaleY(var(--un-scale-y)) scaleZ(var(--un-scale-z))}@keyframes ping{0%{transform:scale(1);opacity:1}75%,to{transform:scale(2);opacity:0}}.animate-ping{animation:ping 1s cubic-bezier(0,0,.2,1) infinite}.cursor-pointer{cursor:pointer}.items-center,[items-center=""]{align-items:center}.justify-center,[justify-center=""]{justify-content:center}.justify-between,[justify-between=""]{justify-content:space-between}.gap-2,[gap-2=""]{gap:.5rem}.gap-4{gap:1rem}.overflow-hidden{overflow:hidden}.border-2{border-width:2px}.border-cyan-200{--un-border-opacity:1;border-color:rgb(165 243 252 / var(--un-border-opacity))}.dark .dark\:border-cyan-500{--un-border-opacity:1;border-color:rgb(6 182 212 / 
var(--un-border-opacity))}.rounded-full{border-radius:9999px}.rounded-lg,[rounded-lg=""]{border-radius:.5rem}.rounded-xl,[rounded-xl=""]{border-radius:.75rem}.bg-cyan-200{--un-bg-opacity:1;background-color:rgb(165 243 252 / var(--un-bg-opacity))}.bg-cyan-300{--un-bg-opacity:1;background-color:rgb(103 232 249 / var(--un-bg-opacity))}.bg-red-200{--un-bg-opacity:1;background-color:rgb(254 202 202 / var(--un-bg-opacity))}.bg-red-300{--un-bg-opacity:1;background-color:rgb(252 165 165 / var(--un-bg-opacity))}.dark .dark\:bg-cyan-600{--un-bg-opacity:1;background-color:rgb(8 145 178 / var(--un-bg-opacity))}.dark .dark\:bg-cyan-800{--un-bg-opacity:1;background-color:rgb(21 94 117 / var(--un-bg-opacity))}.dark .dark\:bg-red-400{--un-bg-opacity:1;background-color:rgb(248 113 113 / var(--un-bg-opacity))}.dark [bg~="dark:cyan-950"]{--un-bg-opacity:1;background-color:rgb(8 51 68 / var(--un-bg-opacity))}[bg~=cyan-50]{--un-bg-opacity:1;background-color:rgb(236 254 255 / var(--un-bg-opacity))}.dark [bg~="dark:hover:cyan-900"]:hover{--un-bg-opacity:1;background-color:rgb(22 78 99 / var(--un-bg-opacity))}[bg~="hover:cyan-100"]:hover{--un-bg-opacity:1;background-color:rgb(207 250 254 / var(--un-bg-opacity))}.p-4,[p-4=""]{padding:1rem}.px-1,[px-1=""]{padding-left:.25rem;padding-right:.25rem}.px-16{padding-left:4rem;padding-right:4rem}.px-4,[px-4=""]{padding-left:1rem;padding-right:1rem}.px-8{padding-left:2rem;padding-right:2rem}.py-1,[py-1=""]{padding-top:.25rem;padding-bottom:.25rem}.py-2,[py-2=""]{padding-top:.5rem;padding-bottom:.5rem}.pl-0{padding-left:0}.pt-4{padding-top:1rem}.text-center{text-align:center}.text-left{text-align:left}.text-2xl,[text-2xl=""]{font-size:1.5rem;line-height:2rem}.text-lg,[text-lg=""]{font-size:1.125rem;line-height:1.75rem}.text-sm,[text-sm=""]{font-size:.875rem;line-height:1.25rem}.dark .dark\:text-white,.dark [text~="dark:white"]{--un-text-opacity:1;color:rgb(255 255 255 / var(--un-text-opacity))}.dark 
[text~="dark:cyan-500"]{--un-text-opacity:1;color:rgb(6 182 212 / var(--un-text-opacity))}.text-gray-700{--un-text-opacity:1;color:rgb(55 65 81 / var(--un-text-opacity))}.text-red-700{--un-text-opacity:1;color:rgb(185 28 28 / var(--un-text-opacity))}[text~=black]{--un-text-opacity:1;color:rgb(0 0 0 / var(--un-text-opacity))}[text~=cyan-400]{--un-text-opacity:1;color:rgb(34 211 238 / var(--un-text-opacity))}[text~=red-400]{--un-text-opacity:1;color:rgb(248 113 113 / var(--un-text-opacity))}[text~="hover:red-300"]:hover{--un-text-opacity:1;color:rgb(252 165 165 / var(--un-text-opacity))}[text~="active:red-400"]:active{--un-text-opacity:1;color:rgb(248 113 113 / var(--un-text-opacity))}.font-sans{font-family:DM Sans,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji"}.opacity-0{opacity:0}.opacity-75{opacity:.75}.shadow-inner{--un-shadow:inset 0 2px 4px 0 var(--un-shadow-color, rgb(0 0 0 / .05));box-shadow:var(--un-ring-offset-shadow),var(--un-ring-shadow),var(--un-shadow)}.outline-none,[outline-none=""]{outline:2px solid transparent;outline-offset:2px}.transition{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-transform{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}[transition~=all]{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-300,[transition~=duration-300]{transition-duration:.3s}.duration-500,[transition~=duration-500]{transition-duration:.5s}.ease,.ease-in-out,[transition~=ease-in-out]{transition-timing-function:cubic-bezier(.4,0,.2,1)}.ease-out{transition-timing-function:cubic-bezier(0,0,.2,1)}
|
assets/play-worklet-CqUYQx_r.js
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
var c=Object.defineProperty;var l=(s,e,t)=>e in s?c(s,e,{enumerable:!0,configurable:!0,writable:!0,value:t}):s[e]=t;var n=(s,e,t)=>l(s,typeof e!="symbol"?e+"":e,t);class d extends AudioWorkletProcessor{constructor(){super();n(this,"bufferQueue",[]);n(this,"currentChunkOffset",0);n(this,"hadData",!1);this.bufferQueue=[],this.currentChunkOffset=0,this.hadData=!1,this.port.onmessage=t=>{const r=t.data;r instanceof Float32Array?(this.hadData=!0,this.bufferQueue.push(r)):r==="stop"&&(this.bufferQueue=[],this.currentChunkOffset=0)}}process(t,r){const f=r[0][0];if(!f)return!0;const h=f.length;let u=0;for(this.hadData&&this.bufferQueue.length===0&&(this.port.postMessage({type:"playback_ended"}),this.hadData=!1);u<h;)if(this.bufferQueue.length>0){const a=this.bufferQueue[0],o=a.length-this.currentChunkOffset,i=Math.min(o,h-u);f.set(a.subarray(this.currentChunkOffset,this.currentChunkOffset+i),u),this.currentChunkOffset+=i,u+=i,this.currentChunkOffset>=a.length&&(this.bufferQueue.shift(),this.currentChunkOffset=0)}else f.fill(0,u),u=h;return!0}}registerProcessor("buffered-audio-worklet-processor",d);
|
assets/vad-processor-0sEQXaXZ.js
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
let s=0;const r=new Float32Array(512);class l extends AudioWorkletProcessor{process(o,n,f){const e=o[0][0];if(!e)return!1;if(e.length>512)this.port.postMessage({buffer:e});else{const t=512-s;e.length>=t?(r.set(e.subarray(0,t),s),this.port.postMessage({buffer:r}),r.fill(0),r.set(e.subarray(t),0),s=e.length-t):(r.set(e,s),s+=e.length)}return!0}}registerProcessor("vad-processor",l);
|
assets/worker-yoCrhISy.ts
ADDED
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import type {
|
2 |
+
AutomaticSpeechRecognitionPipeline,
|
3 |
+
CausalLMOutputWithPast,
|
4 |
+
GPT2Tokenizer,
|
5 |
+
LlamaForCausalLM,
|
6 |
+
PreTrainedModel,
|
7 |
+
StoppingCriteriaList,
|
8 |
+
} from '@huggingface/transformers'
|
9 |
+
import type { Device, DType } from '@xsai-transformers/shared/types'
|
10 |
+
import type { GenerateOptions } from 'kokoro-js'
|
11 |
+
import type {
|
12 |
+
WorkerMessageEventError,
|
13 |
+
WorkerMessageEventInfo,
|
14 |
+
WorkerMessageEventOutput,
|
15 |
+
WorkerMessageEventProgress,
|
16 |
+
WorkerMessageEventSetVoiceResponse,
|
17 |
+
WorkerMessageEventStatus,
|
18 |
+
} from '../types/worker'
|
19 |
+
|
20 |
+
import {
|
21 |
+
// VAD
|
22 |
+
AutoModel,
|
23 |
+
|
24 |
+
AutoModelForCausalLM,
|
25 |
+
// LLM
|
26 |
+
AutoTokenizer,
|
27 |
+
InterruptableStoppingCriteria,
|
28 |
+
pipeline,
|
29 |
+
|
30 |
+
// Speech recognition
|
31 |
+
Tensor,
|
32 |
+
TextStreamer,
|
33 |
+
} from '@huggingface/transformers'
|
34 |
+
import { isWebGPUSupported } from 'gpuu/webgpu'
|
35 |
+
import { KokoroTTS, TextSplitterStream } from 'kokoro-js'
|
36 |
+
|
37 |
+
import {
|
38 |
+
EXIT_THRESHOLD,
|
39 |
+
INPUT_SAMPLE_RATE,
|
40 |
+
MAX_BUFFER_DURATION,
|
41 |
+
MAX_NUM_PREV_BUFFERS,
|
42 |
+
MIN_SILENCE_DURATION_SAMPLES,
|
43 |
+
MIN_SPEECH_DURATION_SAMPLES,
|
44 |
+
SPEECH_PAD_SAMPLES,
|
45 |
+
SPEECH_THRESHOLD,
|
46 |
+
} from '../constants'
|
47 |
+
|
48 |
+
interface Message {
|
49 |
+
role: 'system' | 'user' | 'assistant'
|
50 |
+
content: string
|
51 |
+
}
|
52 |
+
|
53 |
+
type Voices = GenerateOptions['voice']
|
54 |
+
export type PretrainedConfig = NonNullable<Parameters<typeof AutoModel.from_pretrained>[1]>['config']
|
55 |
+
|
56 |
+
const whisperDtypeMap: Record<Device, DType> = {
|
57 |
+
webgpu: {
|
58 |
+
encoder_model: 'fp32',
|
59 |
+
decoder_model_merged: 'fp32',
|
60 |
+
},
|
61 |
+
wasm: {
|
62 |
+
encoder_model: 'fp32',
|
63 |
+
decoder_model_merged: 'q8',
|
64 |
+
},
|
65 |
+
}
|
66 |
+
|
67 |
+
const model_id = 'onnx-community/Kokoro-82M-v1.0-ONNX'
|
68 |
+
let voice: Voices | undefined
|
69 |
+
let silero_vad: PreTrainedModel
|
70 |
+
let transcriber: AutomaticSpeechRecognitionPipeline
|
71 |
+
let tts: KokoroTTS
|
72 |
+
|
73 |
+
const SYSTEM_MESSAGE: Message = {
|
74 |
+
role: 'system',
|
75 |
+
content:
|
76 |
+
'You\'re a helpful and conversational voice assistant. Keep your responses short, clear, and casual.',
|
77 |
+
}
|
78 |
+
let messages: Message[] = [SYSTEM_MESSAGE]
|
79 |
+
let past_key_values_cache: any = null
|
80 |
+
let stopping_criteria: InterruptableStoppingCriteria | null = null
|
81 |
+
|
82 |
+
// Global audio buffer to store incoming audio
|
83 |
+
const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE)
|
84 |
+
let bufferPointer = 0
|
85 |
+
|
86 |
+
// Initial state for VAD
|
87 |
+
const sr = new Tensor('int64', [INPUT_SAMPLE_RATE], [])
|
88 |
+
let state = new Tensor('float32', new Float32Array(2 * 1 * 128), [2, 1, 128])
|
89 |
+
|
90 |
+
// Whether we are in the process of adding audio to the buffer
|
91 |
+
let isRecording = false
|
92 |
+
let isPlaying = false // new flag
|
93 |
+
|
94 |
+
let tokenizer: GPT2Tokenizer
|
95 |
+
let llm: LlamaForCausalLM
|
96 |
+
|
97 |
+
const prevBuffers: Float32Array[] = []
|
98 |
+
|
99 |
+
export async function loadModels() {
|
100 |
+
tts = await KokoroTTS.from_pretrained(model_id, {
|
101 |
+
dtype: 'fp32',
|
102 |
+
device: 'webgpu',
|
103 |
+
})
|
104 |
+
|
105 |
+
const device = 'webgpu'
|
106 |
+
globalThis.postMessage({ type: 'info', data: { message: `Using device: "${device}"` } } satisfies WorkerMessageEventInfo)
|
107 |
+
globalThis.postMessage({ type: 'info', data: { message: 'Loading models...', duration: 'until_next' } } satisfies WorkerMessageEventInfo)
|
108 |
+
|
109 |
+
// Load models
|
110 |
+
silero_vad = await AutoModel.from_pretrained(
|
111 |
+
'onnx-community/silero-vad',
|
112 |
+
{
|
113 |
+
config: { model_type: 'custom' } as PretrainedConfig,
|
114 |
+
dtype: 'fp32', // Full-precision
|
115 |
+
progress_callback: progress => globalThis.postMessage({ type: 'progress', data: { message: progress } } satisfies WorkerMessageEventProgress),
|
116 |
+
},
|
117 |
+
).catch((error: Error) => {
|
118 |
+
globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
|
119 |
+
throw error
|
120 |
+
})
|
121 |
+
|
122 |
+
transcriber = await pipeline(
|
123 |
+
'automatic-speech-recognition',
|
124 |
+
'onnx-community/whisper-base', // or "onnx-community/moonshine-base-ONNX",
|
125 |
+
{
|
126 |
+
device,
|
127 |
+
dtype: whisperDtypeMap[device as keyof typeof whisperDtypeMap],
|
128 |
+
progress_callback: progress => globalThis.postMessage({ type: 'progress', data: { message: progress } } satisfies WorkerMessageEventProgress),
|
129 |
+
},
|
130 |
+
).catch((error: Error) => {
|
131 |
+
globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
|
132 |
+
throw error
|
133 |
+
})
|
134 |
+
|
135 |
+
await transcriber(new Float32Array(INPUT_SAMPLE_RATE)) // Compile shaders
|
136 |
+
|
137 |
+
llm = await AutoModelForCausalLM.from_pretrained(
|
138 |
+
'HuggingFaceTB/SmolLM2-1.7B-Instruct',
|
139 |
+
{
|
140 |
+
dtype: await isWebGPUSupported() ? 'q4f16' : 'int8',
|
141 |
+
device: await isWebGPUSupported() ? 'webgpu' : 'wasm',
|
142 |
+
progress_callback: progress => globalThis.postMessage({ type: 'progress', data: { message: progress } } satisfies WorkerMessageEventProgress),
|
143 |
+
},
|
144 |
+
).catch((error: Error) => {
|
145 |
+
globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
|
146 |
+
throw error
|
147 |
+
})
|
148 |
+
|
149 |
+
tokenizer = await AutoTokenizer.from_pretrained(
|
150 |
+
'HuggingFaceTB/SmolLM2-1.7B-Instruct',
|
151 |
+
).catch((error: Error) => {
|
152 |
+
globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
|
153 |
+
throw error
|
154 |
+
})
|
155 |
+
|
156 |
+
await llm.generate({ ...tokenizer('x'), max_new_tokens: 1 }) // Compile shaders
|
157 |
+
|
158 |
+
globalThis.postMessage({
|
159 |
+
type: 'status',
|
160 |
+
data: {
|
161 |
+
status: 'ready',
|
162 |
+
message: 'Ready!',
|
163 |
+
voices: tts.voices,
|
164 |
+
},
|
165 |
+
} as WorkerMessageEventStatus)
|
166 |
+
}
|
167 |
+
|
168 |
+
loadModels()
|
169 |
+
|
170 |
+
/**
|
171 |
+
* Perform Voice Activity Detection (VAD)
|
172 |
+
* @param buffer The new audio buffer
|
173 |
+
* @returns `true` if the buffer is speech, `false` otherwise.
|
174 |
+
*/
|
175 |
+
async function vad(buffer?: Float32Array): Promise<boolean> {
|
176 |
+
if (!buffer) {
|
177 |
+
// Possibly closed or interrupted
|
178 |
+
return false
|
179 |
+
}
|
180 |
+
|
181 |
+
const input = new Tensor('float32', buffer, [1, buffer.length])
|
182 |
+
|
183 |
+
const { stateN, output } = await silero_vad({ input, sr, state })
|
184 |
+
state = stateN // Update state
|
185 |
+
|
186 |
+
const isSpeech = output.data[0]
|
187 |
+
|
188 |
+
// Use heuristics to determine if the buffer is speech or not
|
189 |
+
return (
|
190 |
+
// Case 1: We are above the threshold (definitely speech)
|
191 |
+
isSpeech > SPEECH_THRESHOLD
|
192 |
+
// Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
|
193 |
+
|| (isRecording && isSpeech >= EXIT_THRESHOLD)
|
194 |
+
)
|
195 |
+
}
|
196 |
+
|
197 |
+
interface SpeechData {
|
198 |
+
start: number
|
199 |
+
end: number
|
200 |
+
duration: number
|
201 |
+
}
|
202 |
+
|
203 |
+
type BatchEncodingItem = number[] | number[][] | Tensor
|
204 |
+
/**
|
205 |
+
* Holds the output of the tokenizer's call function.
|
206 |
+
*/
|
207 |
+
interface BatchEncoding {
|
208 |
+
/**
|
209 |
+
* List of token ids to be fed to a model.
|
210 |
+
*/
|
211 |
+
input_ids: BatchEncodingItem
|
212 |
+
/**
|
213 |
+
* List of indices specifying which tokens should be attended to by the model.
|
214 |
+
*/
|
215 |
+
attention_mask: BatchEncodingItem
|
216 |
+
/**
|
217 |
+
* List of token type ids to be fed to a model.
|
218 |
+
*/
|
219 |
+
token_type_ids?: BatchEncodingItem
|
220 |
+
}
|
221 |
+
|
222 |
+
/**
|
223 |
+
* Transcribe the audio buffer
|
224 |
+
* @param buffer The audio buffer
|
225 |
+
* @param _data Additional data
|
226 |
+
*/
|
227 |
+
async function speechToSpeech(buffer: Float32Array, _data: SpeechData): Promise<void> {
|
228 |
+
isPlaying = true
|
229 |
+
|
230 |
+
// 1. Transcribe the audio from the user
|
231 |
+
const result = await transcriber(buffer)
|
232 |
+
const text = (result as { text: string }).text.trim()
|
233 |
+
|
234 |
+
if (['', '[BLANK_AUDIO]'].includes(text)) {
|
235 |
+
// If the transcription is empty or a blank audio, we skip the rest of the processing
|
236 |
+
return
|
237 |
+
}
|
238 |
+
|
239 |
+
messages.push({ role: 'user', content: text })
|
240 |
+
|
241 |
+
// Set up text-to-speech streaming
|
242 |
+
const splitter = new TextSplitterStream()
|
243 |
+
const stream = tts!.stream(splitter, { voice });
|
244 |
+
(async () => {
|
245 |
+
for await (const { text, audio } of stream) {
|
246 |
+
globalThis.postMessage({ type: 'output', data: { text, result: audio } } satisfies WorkerMessageEventOutput)
|
247 |
+
}
|
248 |
+
})()
|
249 |
+
|
250 |
+
// 2. Generate a response using the LLM
|
251 |
+
const inputs = tokenizer.apply_chat_template(messages, {
|
252 |
+
add_generation_prompt: true,
|
253 |
+
return_dict: true,
|
254 |
+
}) as BatchEncoding
|
255 |
+
|
256 |
+
const streamer = new TextStreamer(tokenizer, {
|
257 |
+
skip_prompt: true,
|
258 |
+
skip_special_tokens: true,
|
259 |
+
callback_function: (text: string) => {
|
260 |
+
splitter.push(text)
|
261 |
+
},
|
262 |
+
token_callback_function: () => {},
|
263 |
+
})
|
264 |
+
|
265 |
+
stopping_criteria = new InterruptableStoppingCriteria()
|
266 |
+
type GenerationFunctionParameters = Parameters<typeof llm.generate>[0] & Record<string, any>
|
267 |
+
|
268 |
+
const generatedRes = await llm.generate({
|
269 |
+
...inputs,
|
270 |
+
past_key_values: past_key_values_cache,
|
271 |
+
do_sample: false, // TODO: do_sample: true is bugged (invalid data location on top-k sample)
|
272 |
+
max_new_tokens: 1024,
|
273 |
+
streamer,
|
274 |
+
stopping_criteria: stopping_criteria as unknown as StoppingCriteriaList,
|
275 |
+
return_dict_in_generate: true,
|
276 |
+
} as GenerationFunctionParameters)
|
277 |
+
|
278 |
+
const { past_key_values, sequences } = generatedRes as CausalLMOutputWithPast & { sequences: Tensor }
|
279 |
+
past_key_values_cache = past_key_values
|
280 |
+
|
281 |
+
// Finally, close the stream to signal that no more text will be added.
|
282 |
+
splitter.close()
|
283 |
+
|
284 |
+
const decoded = tokenizer.batch_decode(
|
285 |
+
// TODO: fix null as any
|
286 |
+
sequences.slice(null, [(inputs.input_ids as Tensor).dims[1], null as any]),
|
287 |
+
{ skip_special_tokens: true },
|
288 |
+
)
|
289 |
+
|
290 |
+
messages.push({ role: 'assistant', content: decoded[0] })
|
291 |
+
}
|
292 |
+
|
293 |
+
// Track the number of samples after the last speech chunk
|
294 |
+
let postSpeechSamples = 0
|
295 |
+
function resetAfterRecording(offset = 0): void {
|
296 |
+
globalThis.postMessage({
|
297 |
+
type: 'status',
|
298 |
+
data: {
|
299 |
+
status: 'recording_end',
|
300 |
+
message: 'Transcribing...',
|
301 |
+
duration: 'until_next',
|
302 |
+
},
|
303 |
+
} satisfies WorkerMessageEventStatus)
|
304 |
+
|
305 |
+
BUFFER.fill(0, offset)
|
306 |
+
bufferPointer = offset
|
307 |
+
isRecording = false
|
308 |
+
postSpeechSamples = 0
|
309 |
+
}
|
310 |
+
|
311 |
+
function dispatchForTranscriptionAndResetAudioBuffer(overflow?: Float32Array): void {
|
312 |
+
// Get start and end time of the speech segment, minus the padding
|
313 |
+
const now = Date.now()
|
314 |
+
const end
|
315 |
+
= now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000
|
316 |
+
const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000
|
317 |
+
const duration = end - start
|
318 |
+
const overflowLength = overflow?.length ?? 0
|
319 |
+
|
320 |
+
// Send the audio buffer to the worker
|
321 |
+
const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES)
|
322 |
+
|
323 |
+
const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0)
|
324 |
+
const paddedBuffer = new Float32Array(prevLength + buffer.length)
|
325 |
+
let offset = 0
|
326 |
+
for (const prev of prevBuffers) {
|
327 |
+
paddedBuffer.set(prev, offset)
|
328 |
+
offset += prev.length
|
329 |
+
}
|
330 |
+
paddedBuffer.set(buffer, offset)
|
331 |
+
speechToSpeech(paddedBuffer, { start, end, duration })
|
332 |
+
|
333 |
+
// Set overflow (if present) and reset the rest of the audio buffer
|
334 |
+
if (overflow) {
|
335 |
+
BUFFER.set(overflow, 0)
|
336 |
+
}
|
337 |
+
resetAfterRecording(overflowLength)
|
338 |
+
}
|
339 |
+
|
340 |
+
globalThis.onmessage = async (event: MessageEvent) => {
|
341 |
+
const { type, buffer } = event.data
|
342 |
+
|
343 |
+
// refuse new audio while playing back
|
344 |
+
if (type === 'audio' && isPlaying)
|
345 |
+
return
|
346 |
+
|
347 |
+
switch (type) {
|
348 |
+
case 'start_call': {
|
349 |
+
const name = tts!.voices[voice ?? 'af_heart']?.name ?? 'Heart'
|
350 |
+
greet(`Hey there, my name is ${name}! How can I help you today?`)
|
351 |
+
return
|
352 |
+
}
|
353 |
+
case 'end_call':
|
354 |
+
messages = [SYSTEM_MESSAGE]
|
355 |
+
past_key_values_cache = null
|
356 |
+
break
|
357 |
+
case 'interrupt':
|
358 |
+
stopping_criteria?.interrupt()
|
359 |
+
return
|
360 |
+
case 'set_voice':
|
361 |
+
voice = event.data.voice
|
362 |
+
|
363 |
+
globalThis.postMessage({
|
364 |
+
type: 'set_voice_response',
|
365 |
+
data: {
|
366 |
+
ok: true,
|
367 |
+
},
|
368 |
+
} satisfies WorkerMessageEventSetVoiceResponse)
|
369 |
+
|
370 |
+
return
|
371 |
+
case 'playback_ended':
|
372 |
+
isPlaying = false
|
373 |
+
return
|
374 |
+
}
|
375 |
+
|
376 |
+
const wasRecording = isRecording // Save current state
|
377 |
+
const isSpeech = await vad(buffer)
|
378 |
+
|
379 |
+
if (!wasRecording && !isSpeech) {
|
380 |
+
// We are not recording, and the buffer is not speech,
|
381 |
+
// so we will probably discard the buffer. So, we insert
|
382 |
+
// into a FIFO queue with maximum size of PREV_BUFFER_SIZE
|
383 |
+
if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
|
384 |
+
// If the queue is full, we discard the oldest buffer
|
385 |
+
prevBuffers.shift()
|
386 |
+
}
|
387 |
+
prevBuffers.push(buffer)
|
388 |
+
return
|
389 |
+
}
|
390 |
+
|
391 |
+
const remaining = BUFFER.length - bufferPointer
|
392 |
+
if (buffer.length >= remaining) {
|
393 |
+
// The buffer is larger than (or equal to) the remaining space in the global buffer,
|
394 |
+
// so we perform transcription and copy the overflow to the global buffer
|
395 |
+
BUFFER.set(buffer.subarray(0, remaining), bufferPointer)
|
396 |
+
bufferPointer += remaining
|
397 |
+
|
398 |
+
// Dispatch the audio buffer
|
399 |
+
const overflow = buffer.subarray(remaining)
|
400 |
+
dispatchForTranscriptionAndResetAudioBuffer(overflow)
|
401 |
+
return
|
402 |
+
}
|
403 |
+
else {
|
404 |
+
// The buffer is smaller than the remaining space in the global buffer,
|
405 |
+
// so we copy it to the global buffer
|
406 |
+
BUFFER.set(buffer, bufferPointer)
|
407 |
+
bufferPointer += buffer.length
|
408 |
+
}
|
409 |
+
|
410 |
+
if (isSpeech) {
|
411 |
+
if (!isRecording) {
|
412 |
+
// Indicate start of recording
|
413 |
+
globalThis.postMessage({
|
414 |
+
type: 'status',
|
415 |
+
data: {
|
416 |
+
status: 'recording_start',
|
417 |
+
message: 'Listening...',
|
418 |
+
duration: 'until_next',
|
419 |
+
},
|
420 |
+
} satisfies WorkerMessageEventStatus)
|
421 |
+
}
|
422 |
+
|
423 |
+
// Start or continue recording
|
424 |
+
isRecording = true
|
425 |
+
postSpeechSamples = 0 // Reset the post-speech samples
|
426 |
+
|
427 |
+
return
|
428 |
+
}
|
429 |
+
|
430 |
+
postSpeechSamples += buffer.length
|
431 |
+
|
432 |
+
// At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
|
433 |
+
// So, we check whether we have reached the end of the current audio chunk.
|
434 |
+
if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
|
435 |
+
// There was a short pause, but not long enough to consider the end of a speech chunk
|
436 |
+
// (e.g., the speaker took a breath), so we continue recording
|
437 |
+
return
|
438 |
+
}
|
439 |
+
|
440 |
+
if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
|
441 |
+
// The entire buffer (including the new chunk) is smaller than the minimum
|
442 |
+
// duration of a speech chunk, so we can safely discard the buffer.
|
443 |
+
resetAfterRecording()
|
444 |
+
return
|
445 |
+
}
|
446 |
+
|
447 |
+
dispatchForTranscriptionAndResetAudioBuffer()
|
448 |
+
}
|
449 |
+
|
450 |
+
function greet(text: string): void {
|
451 |
+
isPlaying = true
|
452 |
+
|
453 |
+
const splitter = new TextSplitterStream()
|
454 |
+
const stream = tts!.stream(splitter, { voice });
|
455 |
+
|
456 |
+
(async () => {
|
457 |
+
for await (const { text: chunkText, audio } of stream) {
|
458 |
+
globalThis.postMessage({ type: 'output', data: { text: chunkText, result: audio } } satisfies WorkerMessageEventOutput)
|
459 |
+
}
|
460 |
+
})()
|
461 |
+
|
462 |
+
splitter.push(text)
|
463 |
+
splitter.close()
|
464 |
+
messages.push({ role: 'assistant', content: text })
|
465 |
+
}
|
favicon-96x96.png
ADDED
![]() |
favicon.svg
ADDED
|
index.html
CHANGED
@@ -1,19 +1,24 @@
|
|
1 |
<!doctype html>
|
2 |
-
<html>
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
19 |
</html>
|
|
|
1 |
<!doctype html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8" />
|
5 |
+
<title>Realtime Conversational WebGPU (Vue)</title>
|
6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=0" />
|
7 |
+
<link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96" />
|
8 |
+
<link rel="icon" type="image/svg+xml" href="/favicon.svg" />
|
9 |
+
<script>
|
10 |
+
;(function () {
|
11 |
+
const prefersDark = window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches
|
12 |
+
const setting = localStorage.getItem('vueuse-color-scheme') || 'auto'
|
13 |
+
if (setting === 'dark' || (prefersDark && setting !== 'light'))
|
14 |
+
document.documentElement.classList.toggle('dark', true)
|
15 |
+
})()
|
16 |
+
</script>
|
17 |
+
<script type="module" crossorigin src="/assets/index-DGmKQH7N.js"></script>
|
18 |
+
<link rel="stylesheet" crossorigin href="/assets/index-cAxkOY9l.css">
|
19 |
+
</head>
|
20 |
+
<body class="font-sans">
|
21 |
+
<div id="app"></div>
|
22 |
+
<noscript> This website requires JavaScript to function properly. Please enable JavaScript to continue. </noscript>
|
23 |
+
</body>
|
24 |
</html>
|
style.css
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
body {
|
2 |
-
padding: 2rem;
|
3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
4 |
-
}
|
5 |
-
|
6 |
-
h1 {
|
7 |
-
font-size: 16px;
|
8 |
-
margin-top: 0;
|
9 |
-
}
|
10 |
-
|
11 |
-
p {
|
12 |
-
color: rgb(107, 114, 128);
|
13 |
-
font-size: 15px;
|
14 |
-
margin-bottom: 10px;
|
15 |
-
margin-top: 5px;
|
16 |
-
}
|
17 |
-
|
18 |
-
.card {
|
19 |
-
max-width: 620px;
|
20 |
-
margin: 0 auto;
|
21 |
-
padding: 16px;
|
22 |
-
border: 1px solid lightgray;
|
23 |
-
border-radius: 16px;
|
24 |
-
}
|
25 |
-
|
26 |
-
.card p:last-child {
|
27 |
-
margin-bottom: 0;
|
28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|